def main(path2template, resultsPath, runs):
    import gzip
    tmpLP = metabolism.Metabolism(util.ImportCplex(path2template))
    tmpLP.setReactionObjectiveMinimizeRest('R("R_Ec_biomass_iAF1260_core_59p81M")')
    include = ('R("Mhb_Transp")', 'R("Mna1b_Transp")', 'R("Mkb_Transp")',
               'R("Mca2b_Transp")', 'R("Mcu2b_Transp")', 'R("Mmg2b_Transp")',
               'R("Mzn2b_Transp")', 'R("Mmobdb_Transp")', 'R("Mfe2b_Transp")',
               'R("Mfe3b_Transp")', 'R("Mcobalt2b_Transp")', 'R("Mmn2b_Transp")',
               'R("Mclb_Transp")')
    lp = Almaas(copy.copy(tmpLP), alwaysInc=include)
    print lp.lp
    f = lp.generateFluxdist()
    for item in range(runs):
        f = lp.generateFluxdist()
        stringDump = dict2tsv(lp.currDict) + "\n" + f.tsv()
        print stringDump
        path = resultsPath + "iAf1260_fluxDist_" + str(item) + ".tsv.gz"
        lp.lp.initialize()
        gzip.open(path, 'w').write(stringDump)
def test_gzip_simple():
    xdense = np.zeros((20, 20))
    xdense[2, 3] = 2.3
    xdense[4, 5] = 4.5
    x = SP.csc_matrix(xdense)
    name = 'gzip_test'
    expected = {'x': x}
    format = '4'
    tmpdir = mkdtemp()
    try:
        fname = pjoin(tmpdir, name)
        mat_stream = gzip.open(fname, mode='wb')
        savemat(mat_stream, expected, format=format)
        mat_stream.close()
        mat_stream = gzip.open(fname, mode='rb')
        actual = loadmat(mat_stream, struct_as_record=True)
        mat_stream.close()
    finally:
        shutil.rmtree(tmpdir)
    assert_array_almost_equal(actual['x'].todense(),
                              expected['x'].todense(),
                              err_msg=repr(actual))
def sam2mat_main(args):
    region_pattern = r'^[^:]+(?::\d+-\d+)?(?:,[^:]+(?::\d+-\d+)?)?$'
    if args.region is not None and re.search(region_pattern, args.region):
        regions = args.region
    elif args.reglist is not None:
        with open(args.reglist) as f:
            regions = [line.rstrip() for line in f]
    else:
        regions = None
    if args.insam is None:
        sam_fh = sys.stdin
    else:
        sam_fh = open(args.insam, 'r')
    bdata = BinnedData(args.fai, regions=regions, resolution=args.resolution)
    bdata.read_sam(sam_fh)
    sam_fh.close()
    if args.clean:
        bdata.clean()
    if args.ice:
        bdata.iterative_correction()
    margins = bdata.dat.sum(axis=0)
    # print(margins)
    # sys.exit()
    try:
        os.makedirs(args.outdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    bin_outfile = os.path.join(args.outdir, 'bins.txt.gz')
    contact_outfile = os.path.join(args.outdir, 'contacts.txt.gz')
    matrix_outfile = os.path.join(args.outdir, 'matrix.txt.gz')
    bin_f = gzip.open(bin_outfile, 'wb')
    contact_f = gzip.open(contact_outfile, 'wb')
    matrix_f = gzip.open(matrix_outfile, 'wb')
    for i, chrom1, b1 in bdata.iter_bins():
        bin_mid1 = (b1[0] + b1[1]) / 2
        if ma.is_masked(margins[i]):
            margin = 0
        else:
            margin = int(margins[i])
        print('{}\t{}\t{}\t{}\t{}'.format(chrom1, 0, bin_mid1, margin, int(margin > 0)), file=bin_f)
        if bdata.cleaned:
            print('\t'.join(bdata.dat.data[i].astype(str)), file=matrix_f)
        else:
            print('\t'.join(bdata.dat[i].astype(str)), file=matrix_f)
        for j, chrom2, b2 in bdata.iter_bins():
            bin_mid2 = (b2[0] + b2[1]) / 2
            contact = bdata.dat[i, j]
            if j > i and not ma.is_masked(contact) and contact > 0:
                print('{}\t{}\t{}\t{}\t{}'.format(chrom1, bin_mid1, chrom2, bin_mid2, int(contact)), file=contact_f)
    bin_f.close()
    contact_f.close()
    matrix_f.close()
def par_fmt(in_f, out_f):
    with gzip.open(out_f, 'wb') as out:
        for line in gzip.open(in_f):
            (p1, p2, prob) = line.strip().split(' ||| ')
            out.write('{}\n'.format(prob))
            out.write('{}\n'.format(p1))
            out.write('{}\n'.format(p2))
def run(self):
    self.emit("elasticcurl begin")
    self.infile = None if self.inurl != -1 else (
        gzip.open(self.args.input) if self.args.input.endswith("gz")
        else open(self.args.input))
    self.outfile = None if self.outurl != -1 else (
        gzip.open(self.args.output, 'w') if self.args.output.endswith("gz")
        else open(self.args.output, 'w'))
    if self.inurl != -1 and self.args.scan:
        # if we're reading from elasticsearch, initiate scan mode
        cmd = "curl -s -XGET '" + self.args.input + "/_search?search_type=scan&scroll=10m&size=" + str(self.args.limit) + "' -d '{ \"query\" : { \"match_all\" : {} } } '"
        result = json.loads(subprocess.check_output(cmd, shell=True))
        self.scroll_id = result['_scroll_id']
    itemsin = 0
    itemsout = 0
    while True:
        offset = itemsin * self.args.jobs + self.args.limit * self.args.id
        itemsread = self.get_items(self.args.limit, offset)
        if itemsread == 0:
            break
        itemsin += itemsread
        self.emit("Read " + str(itemsin) + " items total")
        itemswrote = self.put_items()
        itemsout += itemswrote
        self.emit("Wrote " + str(itemsout) + " items total")
    if self.inurl == -1:
        self.infile.close()
    if self.outurl == -1:
        self.outfile.close()
    self.emit("elasticcurl end")
def stream_for_day(self, day):
    cache_filename = self.cache_filename_base(day) + '.json.gz'
    if os.path.isfile(cache_filename):
        log.info('found stream in cache: %r on %s', self.keyword, day)
        with gzip.open(cache_filename, 'rb') as f:
            for mention in json.load(f):
                yield mention
    else:
        cache_file_dirname = os.path.dirname(cache_filename)
        if not os.path.isdir(cache_file_dirname):
            os.makedirs(cache_file_dirname)
        tmp_cache_filename = (cache_filename + '.%05d.tmp' % randint(0, 10000))
        with gzip.open(tmp_cache_filename, 'wb') as f:
            f.write('[')
            first = True
            stream = super(CachingMentionCounter, self).stream_for_day(day)
            for mention in stream:
                if not first:
                    f.write(', ')
                json.dump(mention, f)
                first = False  # ensure later items are comma-separated
                yield mention
            f.write(']')
        os.rename(tmp_cache_filename, cache_filename)
        log.info('cached stream for %r on %s', self.keyword, day)
def _get_json_data(eid=None, fpath=None):
    """
    Returns the JSON data corresponding to the game represented by eid.

    If the JSON data is already on disk, it is read, decompressed and
    returned. Otherwise, the JSON data is downloaded from the NFL web site.
    If the data doesn't exist yet or there was an error, _get_json_data
    returns None.

    If eid is None, then the JSON data is read from the file at fpath.
    """
    assert eid is not None or fpath is not None
    if fpath is not None:
        return gzip.open(fpath, 'rt').read()

    fpath = _jsonf % eid
    if os.access(fpath, os.R_OK):
        return gzip.open(fpath, 'rt').read()
    try:
        url = _json_base_url % (eid, eid)
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.HTTPError:
        pass
    except socket.timeout:
        pass
    return None
def convert_SRC_counter_output(read_offsets, SRC_counter_output_file_name,
                               read_bank_queries_file_name,
                               min_avg_coverage_threshold, max_avg_coverage_threshold):
    # OPEN SRC OUTPUT
    if "gz" in SRC_counter_output_file_name:
        srcfile = gzip.open(SRC_counter_output_file_name, "r")
    else:
        srcfile = open(SRC_counter_output_file_name, "r")

    # OPEN BANK OUTPUT
    # We open two streams, one of them (bankfile) is the 'query' from SRC counter
    # (31: in the following example). This stream has less random accesses
    if "gz" in read_bank_queries_file_name:
        bankfile = gzip.open(read_bank_queries_file_name, "r")
    else:
        bankfile = open(read_bank_queries_file_name, "r")

    # 0 3.614286 4 2 5
    # id mean median min max
    for line in srcfile.readlines():
        if line[0] == '#':  # header
            continue
        line = line.rstrip()
        avg_coverage = float(line.split()[1])
        if avg_coverage >= min_avg_coverage_threshold and avg_coverage <= max_avg_coverage_threshold:
            query_read_id = int(line.split()[0])
            print get_read(bankfile, read_offsets[query_read_id],
                           "cov" + line[line.index(" "):].replace(' ', '_')),
    srcfile.close()
    bankfile.close()
def setUp(self):
    self.tab_file_name = "__gwasutil_wheader__.txt"
    with gzip.open("%s.gz" % (self.tab_file_name), 'w') as gzfile:
        with open(self.tab_file_name, 'w') as file:
            header_line = "%s\n" % ("\t".join(GwasEntry.HEADER_COMPS))
            file.write(header_line)
            gzfile.write(header_line)
            self.gwas_entries = []
            for i in range(1, 10):
                self.gwas_entries.append(GwasEntry('1', 1000 * i, "rs%d " % (100000 + numpy.random.randn())))
                line = "%s\n" % (self.gwas_entries[-1].to_str("\t"))
                file.write(line)
                gzfile.write(line)

    self.comma_file_name = "__gwasutil_wheader_c__.txt"
    with gzip.open("%s.gz" % (self.comma_file_name), 'w') as gzfile:
        with open(self.comma_file_name, 'w') as file:
            header_line = "%s\n" % (",".join(GwasEntry.HEADER_COMPS))
            file.write(header_line)
            gzfile.write(header_line)
            self.gwas_entries = []
            for i in range(1, 10):
                self.gwas_entries.append(GwasEntry('1', 1000 * i, "rs%d " % (100000 + numpy.random.randn())))
                line = "%s\n" % (self.gwas_entries[-1].to_str(","))
                file.write(line)
                gzfile.write(line)
def get_data(series_id, platform_id, impute=False):
    matrixFilename = get_matrix_filename(series_id, platform_id)
    # setup data for specific platform
    for attempt in (0, 1):
        try:
            headerRows = __getMatrixNumHeaderLines(gzip.open(matrixFilename))
            na_values = ["null", "NA", "NaN", "N/A", "na", "n/a", ""]
            data = pd.io.parsers.read_table(gzip.open(matrixFilename),
                                            skiprows=headerRows,
                                            index_col=["ID_REF"],
                                            na_values=na_values,
                                            skipfooter=1,
                                            engine='python')
        except IOError as e:
            # In case we have a corrupt file
            print "Failed loading %s: %s" % (matrixFilename, e)
            os.remove(matrixFilename)
            if attempt:
                raise
            matrixFilename = get_matrix_filename(series_id, platform_id)

    data = clean_data(data)

    # drop samples
    if len(data.columns) == 1:
        data = data.dropna()
    elif impute:
        data = impute_data(data)

    data = log_data(data)  # log

    data.index = data.index.astype(str)
    data.index.name = "probe"
    data.columns.name = 'gsm_name'
    for column in data.columns:
        data[column] = data[column].astype(np.float64)
    # data.to_csv("float64.data.csv")
    return data
def getData(imagePath, labelPath):
    imageFile, labelFile = gzip.open(os.path.join(".", imagePath), 'rb'), gzip.open(os.path.join(".", labelPath), 'rb')
    iMagic, iSize, rows, cols = struct.unpack('>IIII', imageFile.read(16))
    lMagic, lSize = struct.unpack('>II', labelFile.read(8))
    x = zeros((lSize, rows, cols), dtype=uint8)
    y = zeros((lSize, 1), dtype=uint8)
    count = 0
    startTime = time()
    for i in range(lSize):
        for row in range(rows):
            for col in range(cols):
                x[i][row][col] = struct.unpack(">B", imageFile.read(1))[0]
        y[i] = struct.unpack(">B", labelFile.read(1))[0]
        count = count + 1
        if count % 101 == 0:
            stdout.write("Image: %d/%d. Time Elapsed: %ds \r" % (i, lSize, time() - startTime))
            stdout.flush()
        # if count > 600:
        #     break
    stdout.write("\n")
    return (x, y)
def main():
    args = options()
    # pdb.set_trace()
    if os.path.splitext(args.file_in)[1] == ".gz":
        f_iter = FastqGeneralIterator(gzip.open(args.file_in, "rU"))
    else:
        f_iter = FastqGeneralIterator(open(args.file_in, "rU"))
    if os.path.splitext(args.file_r1)[1] != ".gz" and os.path.splitext(args.file_r2)[1] != ".gz":
        args.file_r1 += ".gz"
        args.file_r2 += ".gz"
    r1_handle = gzip.open(args.file_r1, "wb")
    r2_handle = gzip.open(args.file_r2, "wb")
    count_r1 = 0
    count_r2 = 0
    for (f_id, f_seq, f_q) in f_iter:
        dic = {"f_id": f_id, "f_seq": f_seq, "f_q": f_q}
        if f_id.endswith("/1"):
            r1_handle.write("@{f_id}\n{f_seq}\n+\n{f_q}\n".format(**dic))
            count_r1 += 1
        elif f_id.endswith("/2"):
            r2_handle.write("@{f_id}\n{f_seq}\n+\n{f_q}\n".format(**dic))
            count_r2 += 1
    r1_handle.close()
    r2_handle.close()
    print("{r1_records} records written to {r1_handle}".format(r1_records=count_r1, r1_handle=args.file_r1))
    print("{r2_records} records written to {r2_handle}".format(r2_records=count_r2, r2_handle=args.file_r2))
def collate_tmps(args):
    """ collate temp files back into 1 sample """
    ## split args
    data, name = args
    ## nproc len list of chunks
    combs = glob.glob(os.path.join(data.dirs.fastqs, "tmp_" + name) + "_R1_*.gz")
    combs.sort(key=lambda x: int(x.split("_")[-1].replace(".gz", "")[0]))
    ## one outfile to write to
    handle_r1 = os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz")
    with gzip.open(handle_r1, 'wb') as out:
        for fname in combs:
            with gzip.open(fname) as infile:
                out.write(infile.read())
    if "pair" in data.paramsdict["datatype"]:
        ## nproc len list of chunks
        combs = glob.glob(os.path.join(data.dirs.fastqs, "tmp_" + name) + "_R2_*.gz")
        combs.sort()
        ## one outfile to write to
        handle_r2 = os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz")
        with gzip.open(handle_r2, 'wb') as out:
            for fname in combs:
                with gzip.open(fname) as infile:
                    out.write(infile.read())
def count_word_alignments(parallelFile, alignmentFile, lang1WordVectors, lang2WordVectors):
    wordAlignDict = {}
    lineNum = 1
    for pLine, aLine in zip(gzip.open(parallelFile, 'r'), gzip.open(alignmentFile, 'r')):
        l1, l2 = pLine.lower().strip().split(' ||| ')
        l1Words, l2Words = (l1.split(), l2.split())
        for wordIndexPair in aLine.strip().split():
            i, j = wordIndexPair.split('-')
            i, j = (int(i), int(j))
            ''' count alignment only if both words have word vectors '''
            if l1Words[i] in lang1WordVectors and l2Words[j] in lang2WordVectors:
                if l2Words[j] in wordAlignDict:
                    if l1Words[i] in wordAlignDict[l2Words[j]]:
                        wordAlignDict[l2Words[j]][l1Words[i]] += 1
                    else:
                        wordAlignDict[l2Words[j]][l1Words[i]] = 1
                else:
                    wordAlignDict[l2Words[j]] = {l1Words[i]: 1}
        if lineNum % 10000 == 0:
            sys.stderr.write(str(lineNum) + ' ')
        lineNum += 1
    sys.stderr.write(str(len(wordAlignDict)) + "\n")
    return wordAlignDict
def events(timefrom, timeto=None):
    timefrom, timeto = resolve_time(timefrom, timeto)
    hour = timedelta(0, 3600)
    currtime = timefrom
    while currtime < timeto:
        jsonname = currtime.date().isoformat() + '-%d.json.gz' % (currtime.hour)
        jsonpath = cache_dir + '/' + jsonname
        jsonurl = 'http://data.githubarchive.org/' + jsonname
        if not os.path.isfile(jsonpath):
            if not os.path.isdir(cache_dir):
                os.mkdir(cache_dir)
            rsp = requests.get(jsonurl)
            if rsp.status_code == 200:
                with open(jsonpath, 'wb') as fp:
                    fp.write(rsp.content)
            else:
                with gzip.open(jsonpath, 'wb') as fp:
                    fp.write('')
        with gzip.open(jsonpath, 'rb') as fp:
            jsontxts = fp.read().decode('utf-8', errors='ignore').splitlines()
        for j in map(json.loads, filter(valid_json, jsontxts)):
            t = parse_iso_time(j['created_at'])
            if t < timefrom or t > timeto:
                continue
            yield j
        currtime += hour
def unmerge(combined_fname, out_template, gz=False):
    outs = []
    if gz:
        outs.append(gzip.open('%s.1.fastq.gz' % out_template, 'w'))
    else:
        outs.append(open('%s.1.fastq' % out_template, 'w'))
    outidx = 1
    last_read = None
    fq = FASTQ(combined_fname)
    for read in fq.fetch():
        if last_read and last_read.name == read.name:
            outidx += 1
            if len(outs) < outidx:
                if gz:
                    outs.append(gzip.open('%s.%s.fastq.gz' % (out_template, outidx), 'w'))
                else:
                    outs.append(open('%s.%s.fastq' % (out_template, outidx), 'w'))
            read.write(outs[outidx - 1])
        else:
            outidx = 1
            read.write(outs[0])
        last_read = read
    fq.close()
    for out in outs:
        out.close()
def test_rotate_twice(self):
    _rotate_files(times=2)
    for filename in FILES_TO_ROTATE:
        self.assertEqual(open(filename).read(), '')
        self.assertEqual(gzip.open(filename + '.1').read(), '')
        self.assertEqual(gzip.open(filename + '.2').read(), FILE_CONTENT)
def test_rotate_NE_RTVsPitsa(self):
    """
    Test horizontal component rotation against PITSA.
    """
    # load test files
    # no with due to py 2.6
    f = gzip.open(os.path.join(self.path, 'rjob_20051006_n.gz'))
    data_n = np.loadtxt(f)
    f.close()
    f = gzip.open(os.path.join(self.path, 'rjob_20051006_e.gz'))
    data_e = np.loadtxt(f)
    f.close()
    # test different angles, one from each sector
    for angle in [30, 115, 185, 305]:
        # rotate traces
        datcorr_r, datcorr_t = rotate_NE_RT(data_n, data_e, angle)
        # load pitsa files
        f = gzip.open(os.path.join(self.path, 'rjob_20051006_r_%sdeg.gz' % angle))
        data_pitsa_r = np.loadtxt(f)
        f.close()
        f = gzip.open(os.path.join(self.path, 'rjob_20051006_t_%sdeg.gz' % angle))
        data_pitsa_t = np.loadtxt(f)
        f.close()
        # Assert.
        self.assertTrue(np.allclose(datcorr_r, data_pitsa_r, rtol=1E-3, atol=1E-5))
        self.assertTrue(np.allclose(datcorr_t, data_pitsa_t, rtol=1E-3, atol=1E-5))
def write_format(file):
    record_parser = GenBank.RecordParser(debug_level=2)
    print "Testing GenBank writing for %s..." % os.path.basename(file)
    # be able to handle gzipped files
    if '.gz' in file:
        cur_handle = gzip.open(file, "r")
        compare_handle = gzip.open(file, "r")
    else:
        cur_handle = open(file, "r")
        compare_handle = open(file, "r")
    iterator = GenBank.Iterator(cur_handle, record_parser)
    compare_iterator = GenBank.Iterator(compare_handle)
    while 1:
        cur_record = iterator.next()
        compare_record = compare_iterator.next()
        if cur_record is None or compare_record is None:
            break
        # print "\tTesting for %s" % cur_record.version
        output_record = str(cur_record) + "\n"
        try:
            do_comparison(compare_record, output_record)
        except AssertionError, msg:
            print "\tTesting for %s" % cur_record.version
            print msg
def test_seis_sim_vs_pitsa_2(self):
    """
    Test simulate_seismometer seismometer simulation against seismometer
    simulation of Pitsa - STS-2 seismometer.
    """
    # load test file
    file = os.path.join(self.path, 'rotz_20081028.gz')
    with gzip.open(file) as f:
        data = np.loadtxt(f)
    # paz of test file
    samp_rate = 200.0
    paz_sts2 = {'poles': [-0.03736 - 0.03617j, -0.03736 + 0.03617j],
                'zeros': [0.0 + 0.0j] * 2,
                'sensitivity': 1.0,
                'gain': 1.5}
    for id, paz in INSTRUMENTS.items():
        # simulate instrument
        datcorr = simulate_seismometer(
            data, samp_rate, paz_remove=paz_sts2, paz_simulate=paz,
            water_level=600.0, zero_mean=False, nfft_pow2=True)
        # load pitsa file
        filename = os.path.join(self.path, 'rotz_20081028_%s.gz' % id)
        with gzip.open(filename) as f:
            data_pitsa = np.loadtxt(f)
        # calculate normalized rms
        rms = np.sqrt(np.sum((datcorr - data_pitsa) ** 2) /
                      np.sum(data_pitsa ** 2))
        self.assertTrue(rms < 1e-04)
def test_seis_sim_vs_pitsa1(self):
    """
    Test simulate_seismometer seismometer simulation against seismometer
    simulation of Pitsa - LE3D seismometer.
    """
    # load test file
    filename = os.path.join(self.path, 'rjob_20051006.gz')
    with gzip.open(filename) as f:
        data = np.loadtxt(f)
    # paz of test file
    samp_rate = 200.0
    paz_le3d = {'poles': [-4.21 + 4.66j, -4.21 - 4.66j, -2.105 + 0.0j],
                'zeros': [0.0 + 0.0j] * 3,
                'sensitivity': 1.0,
                'gain': 0.4}
    for id, paz in INSTRUMENTS.items():
        # simulate instrument
        datcorr = simulate_seismometer(
            data, samp_rate, paz_remove=paz_le3d, paz_simulate=paz,
            water_level=600.0, zero_mean=False, nfft_pow2=True)
        # load pitsa file
        filename = os.path.join(self.path, 'rjob_20051006_%s.gz' % id)
        with gzip.open(filename) as f:
            data_pitsa = np.loadtxt(f)
        # calculate normalized rms
        rms = np.sqrt(np.sum((datcorr - data_pitsa) ** 2) /
                      np.sum(data_pitsa ** 2))
        self.assertTrue(rms < 1.1e-05)
def RebuildCompoundJSON():
    kegg_dict = {}
    for d in json.load(gzip.open(OLD_COMPOUND_JSON_FNAME, 'r')):
        cid = d['CID']
        kegg_dict[cid] = {'compound_id': cid,
                          'name': d['name'],
                          'names': d['names'],
                          'inchi': d['InChI']}

    # override some of the compounds or add new ones with 'fake' IDs,
    # i.e. C80000 or higher.
    for d in csv.DictReader(open(KEGG_ADDITIONS_TSV_FNAME, 'r'), delimiter='\t'):
        cid = 'C%05d' % int(d['cid'])
        kegg_dict[cid] = {'compound_id': cid,
                          'name': d['name'],
                          'names': [d['name']],
                          'inchi': d['inchi']}

    compound_json = [kegg_dict[compound_id] for compound_id in sorted(kegg_dict.keys())]

    new_json = gzip.open(KEGG_COMPOUND_JSON_FNAME, 'w')
    json.dump(compound_json, new_json, sort_keys=True, indent=4)
    new_json.close()
def main(args):
    mmatch = args.mismatches
    minleng = args.minlength
    r1_primers = fasta_to_dict(args.r1primer)
    r2_primers = fasta_to_dict(args.r2primer)
    r1_out = gzip.open(get_name(args.r1, "trimmed"), 'wb')
    r2_out = gzip.open(get_name(args.r2, "trimmed"), 'wb')
    for i, (r1, r2) in enumerate(izip(readfq(args.r1), readfq(args.r2)), start=1):
        if i % 100000 == 0:
            print >>sys.stderr, ">> processed %d reads" % i
        assert r1.name.split()[0] == r2.name.split()[0]
        # determine primer being used, trim location
        p1, r1_left_trim = get_primer(r1.seq, r1_primers, mmatch)
        p2, r2_left_trim = get_primer(r2.seq, r2_primers, mmatch)
        if not p1 or not p2:
            continue
        # find start of RC of primer in opposing sequence
        r1_right_trim = trim_loc(r1.seq[r1_left_trim:], rev_comp(r2_primers[p2]))
        r2_right_trim = trim_loc(r2.seq[r2_left_trim:], rev_comp(r1_primers[p1]))
        r1.name = "{id}:{cregion}:{fwork} 1".format(id=r1.name.split()[0], cregion=p1, fwork=p2)
        r2.name = "{id}:{cregion}:{fwork} 2".format(id=r2.name.split()[0], cregion=p1, fwork=p2)
        # do the trimming of seq and qual
        r1_full_trim = r1_right_trim + r1_left_trim
        r1.seq = r1.seq[r1_left_trim:r1_full_trim]
        r1.qual = r1.qual[r1_left_trim:r1_full_trim]
        r2_full_trim = r2_right_trim + r2_left_trim
        r2.seq = r2.seq[r2_left_trim:r2_full_trim]
        r2.qual = r2.qual[r2_left_trim:r2_full_trim]
        if len(r1.seq) < minleng or len(r2.seq) < minleng:
            continue
        # write the records
        r1_out.write(r1.__str__() + "\n")
        r2_out.write(r2.__str__() + "\n")
def test_processFeatDicts(self):
    """Test that the basic processing of feature dicts works as expected"""
    ifs = gzip.open(self.FEAT_FILENAME)
    reader = FeatureDictReader(ifs)
    instance = NormalizeFeatDicts()
    instance.loadFeatKeyToColMap(self.MAP_FILENAME)
    sMinMaxDict = instance.processFeatureDictList(reader)
    self.assertEqual(sMinMaxDict['testOther'], self.EXP_TEST_OTHER_MINRANGE)
    self.assert_(len(sMinMaxDict.keys()) <= len(self.featureList))
    # log.info('The MinMaxDict is %s' % pformat(sMinMaxDict))
    ifs.close()

    ifs = gzip.open(self.FEAT_FILENAME)
    reader = FeatureDictReader(ifs)
    newFDictList = []
    theIdList = []
    for idx, fdict in instance.normalizeFeatDictList(reader, sMinMaxDict, self.mapObj):
        newFDictList.append(fdict)
        theIdList.append(idx)
    ifs.close()
    self.assertEqual(len(newFDictList), len(self.trdataList))
    self.assertEqual(len(theIdList), len(self.trdataList))
def export_compounds(priority, name, ionic_strength, pMg, pH_list):
    pseudoisomer_fname = DOWNLOADS_PSEUDOISOMER_PREFIX + '_%s.csv.gz' % name
    csv_pseudoisomers = csv.writer(gzip.open(pseudoisomer_fname, 'w'))
    csv_pseudoisomers.writerow(["!MiriamID::urn:miriam:kegg.compound",
                                "!Name", "!dG0 (kJ/mol)", "!nH",
                                "!charge", "!nMg", "!Note"])

    csv_compound_dict = {}
    for pH in pH_list:
        compound_fname = DOWNLOADS_COMPOUND_PREFIX + '_%s_ph%.1f.csv.gz' % (name, pH)
        csv_compound_dict[pH] = csv.writer(gzip.open(compound_fname, 'w'))
        csv_compound_dict[pH].writerow(["!MiriamID::urn:miriam:kegg.compound",
                                        "!Name", "!dG0_prime (kJ/mol)",
                                        "!pH", "!I (mM)", "!T (Kelvin)",
                                        "!Note"])

    logging.info("Writing chemical and biochemical formation energies for %s to: %s"
                 % (name, pseudoisomer_fname))
    for compound in models.Compound.objects.all():
        phase = compound.GetDefaultPhaseName()
        rows = compound.ToCSVdG0(priority, phase=phase)
        csv_pseudoisomers.writerows(rows)
        for pH in pH_list:
            aq_params = conditions.AqueousParams(pH=pH, pMg=pMg, ionic_strength=ionic_strength)
            rows = compound.ToCSVdG0Prime(priority, aq_params=aq_params, phase=phase)
            csv_compound_dict[pH].writerows(rows)
def main(options):
    freq_range = range(options["from"], options["to"] + 1)
    gt_file = gzip.open(options["gt_file"], "r")
    pos_file = gzip.open(options["pos_file"], "r")
    out_haps = gzip.open(options["out_root"] + "/haps.gz", "w")
    out_haps_fn = [gzip.open(options["out_root"] + "/haps.f" + str(x) + ".gz", "w") for x in freq_range]
    out_samples = open(options["out_root"] + "/samples.txt", "w")

    gt = np.genfromtxt(gt_file, delimiter=1)
    pos = np.genfromtxt(pos_file)
    pos = np.floor(pos * options["chr_len"]).astype(int)
    gt = gt.transpose().astype(int)
    # This is because on some platforms np.genfromtxt tries to import the line endings...
    gt = gt[range(len(pos)), ]
    (nsnp, nind) = gt.shape
    ACs = np.sum(gt, axis=1)
    MACs = np.minimum(ACs, nind - ACs)

    for i in range(nsnp):
        out_haps.write(("\t".join(["%d"] * (nind + 1)) + "\n") % ((pos[i],) + tuple(gt[i, ])))
        if MACs[i] >= options["from"] and MACs[i] <= options["to"]:
            idx = MACs[i] - options["from"]
            out_haps_fn[idx].write(("\t".join(["%d"] * (nind + 1)) + "\n") % ((pos[i],) + tuple(gt[i, ])))

    for i in range(int(nind / 2)):
        out_samples.write("SIM%d\n" % (i + 1,))

    for fil in [gt_file, pos_file, out_haps] + out_haps_fn:
        fil.close()
def __init__(self):
    self.ls = []
    self.outputBuffer = ''  # output buffer
    self.of = gzip.open("final_indx", 'wb')    # final output; remove gzip. for debug mode
    self.lexf = gzip.open("final_lex", 'wb')   # final lexicon structure; remove gzip. for debug mode
    self.r = 11  # total number of files
    self.sz = int(math.floor(524288000 * 2 / 100 / (self.r + 1)))
def nwaymerge(s, e, memory, file_prefix, final_file):
    merge = Merge()
    f = []   # File pointers list
    fr = []  # Input read buffers
    merge.of = gzip.open(final_file + 'index', 'wb')  # final output; remove gzip. for debug mode
    merge.lexf = gzip.open(final_file + 'lex', 'wb')  # final lexicon structure; remove gzip. for debug mode
    heap_items = []
    merge.sz = int(math.floor(memory / (e - s + 2)))
    for i in range(s, e + 1):
        f.append(gzip.open(file_prefix + str(i), 'rb'))
    print("files opened")
    for i in range(e - s + 1):
        fr.append(merge.readInput(f[i], merge.sz))
        # put first set in heap; appends a marker to the last element
        merge.constructHeap(fr[i][0].split('\n'), heap_items, i)
    while len(heap_items) > 0:
        tok, doc_id, hi = heapq.heappop(heap_items)  # pop tokens
        if tok != '':
            merge.write_final(tok, hi.invitem)
        if hi.is_last == True:
            # if last element of file, read the next set of data
            if fr[hi.fl][1]:  # checks if file is not empty
                fr[hi.fl] = merge.readInput(f[hi.fl], merge.sz)
                merge.constructHeap(fr[hi.fl][0].split('\n'), heap_items, hi.fl)
    merge.writeLex()  # Writes lexicon details to file
def get_highlighted(self, filename, hl_lines=None):
    """Get the highlighted version of a file."""
    hl_lines = sorted(hl_lines or [])
    st = os.stat(filename)
    key = '%s-%d-%s-%s' % (filename, int(st.st_mtime), CACHE_SERIAL, hl_lines)
    key = os.path.join(self.cache_dir, hashlib.sha1(key).hexdigest() + '.html.gz')
    try:
        with gzip.open(key) as keyfile:
            return keyfile.read()
    except IOError:
        with open(filename) as infile:
            file_data = infile.read()
        try:
            lexer = lexers.guess_lexer_for_filename(filename, file_data)
        except pygments.util.ClassNotFound:
            try:
                lexer = lexers.guess_lexer(file_data)
            except pygments.util.ClassNotFound:
                lexer = lexers.TextLexer()
        highlight = pygments.highlight(
            file_data, lexer,
            formatters.HtmlFormatter(
                hl_lines=hl_lines, linenos='table',
                lineanchors='line', anchorlinenos=True))
        with gzip.open(key, 'w') as keyfile:
            keyfile.write(highlight.encode('utf-8'))
        return highlight
def dat2hdf5(table_dir):
    """
    Convert the Marshall et al. (2006) map from \*.dat.gz to \*.hdf5.
    """
    import astropy.io.ascii as ascii
    import gzip
    from contextlib import closing

    readme_fname = os.path.join(table_dir, 'ReadMe')
    table_fname = os.path.join(table_dir, 'table1.dat.gz')
    h5_fname = os.path.join(table_dir, 'marshall.h5')

    # Extract the gzipped table
    with gzip.open(table_fname, 'rb') as f:
        # Read in the table using astropy's CDS table reader
        r = ascii.get_reader(ascii.Cds, readme=readme_fname)
        r.data.table_name = 'table1.dat'  # Hack to deal with bug in CDS reader.
        table = r.read(f)
        print(table)

    # Reorder table entries according to Galactic (l, b)
    l = coordinates.Longitude(table['GLON'][:], wrap_angle=180. * units.deg)
    b = table['GLAT'][:]
    sort_idx = np.lexsort((b, l))
    l = l[sort_idx].astype('f4')
    b = b[sort_idx].astype('f4')
    l.shape = (801, 81)
    b.shape = (801, 81)

    # Extract arrays from the table
    chi2_all = np.reshape((table['x2all'][sort_idx]).astype('f4'), (801, 81))
    chi2_giants = np.reshape((table['x2gts'][sort_idx]).astype('f4'), (801, 81))

    A = np.empty((801 * 81, 33), dtype='f4')
    sigma_A = np.empty((801 * 81, 33), dtype='f4')
    dist = np.empty((801 * 81, 33), dtype='f4')
    sigma_dist = np.empty((801 * 81, 33), dtype='f4')

    for k in range(33):
        A[:, k] = table['ext{:d}'.format(k + 1)][sort_idx]
        sigma_A[:, k] = table['e_ext{:d}'.format(k + 1)][sort_idx]
        dist[:, k] = table['r{:d}'.format(k + 1)][sort_idx]
        sigma_dist[:, k] = table['e_r{:d}'.format(k + 1)][sort_idx]

    A.shape = (801, 81, 33)
    sigma_A.shape = (801, 81, 33)
    dist.shape = (801, 81, 33)
    sigma_dist.shape = (801, 81, 33)

    # Construct the HDF5 file
    h5_fname = os.path.join(table_dir, 'marshall.h5')
    filter_kwargs = dict(
        chunks=True,
        compression='gzip',
        compression_opts=3,
        # scaleoffset=4
    )

    with h5py.File(h5_fname, 'w') as f:
        dset = f.create_dataset('A', data=A, **filter_kwargs)
        dset.attrs['description'] = 'Extinction of each bin'
        dset.attrs['band'] = 'Ks (2MASS)'
        dset.attrs['units'] = 'mag'

        dset = f.create_dataset('sigma_A', data=sigma_A, **filter_kwargs)
        dset.attrs['description'] = 'Extinction uncertainty of each bin'
        dset.attrs['band'] = 'Ks (2MASS)'
        dset.attrs['units'] = 'mag'

        dset = f.create_dataset('dist', data=dist, **filter_kwargs)
        dset.attrs['description'] = 'Distance of each bin'
        dset.attrs['units'] = 'kpc'

        dset = f.create_dataset('sigma_dist', data=sigma_dist, **filter_kwargs)
        dset.attrs['description'] = 'Distance uncertainty of each bin'
        dset.attrs['units'] = 'kpc'

        dset = f.create_dataset('chi2_all', data=chi2_all, **filter_kwargs)
        dset.attrs['description'] = 'Chi^2, based on all the stars'
        dset.attrs['units'] = 'unitless'

        dset = f.create_dataset('chi2_giants', data=chi2_giants, **filter_kwargs)
        dset.attrs['description'] = 'Chi^2, based on giants only'
        dset.attrs['units'] = 'unitless'

        # filter_kwargs.pop('scaleoffset')
        dset = f.create_dataset('l', data=l, **filter_kwargs)
        dset.attrs['description'] = 'Galactic longitude'
        dset.attrs['units'] = 'deg'

        dset = f.create_dataset('b', data=b, **filter_kwargs)
        dset.attrs['description'] = 'Galactic latitude'
        dset.attrs['units'] = 'deg'
import numpy as np
import gzip    # decompression, used to read the compressed data set
import pickle  # reading the data set into arrays
import os.path

# Open the training data set
with gzip.open('mnist.pkl.gz', 'rb') as f:
    # train_set[0] is a 50,000 x 784 matrix --> pixel data
    # train_set[1] is a 50,000 x 1 matrix --> true labels
    # encoding='iso-8859-1' is required because Python 3 uses a different
    # default encoding and the training data set was pickled with Python 2
    train_set, valid_set, test_set = pickle.load(f, encoding='iso-8859-1')

train_x = train_set[0]
train_y = train_set[1]  # digits in their numeric representation
test_x = test_set[0]
test_y = test_set[1]

# convert the digits into one-hot vectors of the appropriate shape
train_y_dec = np.zeros([len(train_y), 10])
for i in range(len(train_y)):
    train_y_dec[i][train_y[i]] = 1

test_y_dec = np.zeros([len(test_y), 10])
for i in range(len(test_y)):
    test_y_dec[i][test_y[i]] = 1

# sigmoid function
def sig(x):
def _load_data(filename):
    with gzip.open(filename, 'rb') as f:
        data = pickle.load(f)
    return data
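# Minimal usage sketch for _load_data; 'model.pkl.gz' is a placeholder name and
# the unpickled object depends entirely on what was stored, so only its type is
# inspected here.
obj = _load_data('model.pkl.gz')
print(type(obj))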
def TemporaryGzipInflation(gzfile):
    """Inflate a gzip file into a temporary file and return its path."""
    import gzip
    from tempfile import NamedTemporaryFile
    # NamedTemporaryFile with delete=False keeps the path valid after the
    # handle is closed (a plain TemporaryFile has no usable filesystem name).
    t = NamedTemporaryFile(mode='wb', delete=False)
    with gzip.open(gzfile, 'rb') as previewfile:
        t.write(previewfile.read())
    t.close()
    return t.name
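# Hedged usage sketch for TemporaryGzipInflation; 'example.txt.gz' is a
# placeholder input and the caller is assumed to remove the inflated copy.
import os
inflated_path = TemporaryGzipInflation('example.txt.gz')
with open(inflated_path, 'rb') as fh:
    print(fh.read()[:80])
os.remove(inflated_path)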
#!/usr/bin/env python2.7
from Bio import SeqIO
import sys, os, gzip, glob

lnfinpat = sys.argv[1:-1]
dirout = sys.argv[-1]

for nfinpat in lnfinpat:
    for nfin in glob.glob(nfinpat):
        if nfin.endswith('.gz'):
            fin = gzip.open(nfin, 'rb')
        else:
            fin = open(nfin, 'r')
        seqrecit = SeqIO.parse(fin, format='genbank')
        for seqrec in seqrecit:
            seqid = seqrec.id
            nfout = seqid + '.gbk'
            with open(os.path.join(dirout, nfout), 'w') as fout:
                SeqIO.write([seqrec], fout, format='genbank')
        fin.close()
def SignApk(data, keyname, pw, platform_api_level, codename_to_api_level_map,
            is_compressed):
    unsigned = tempfile.NamedTemporaryFile()
    unsigned.write(data)
    unsigned.flush()

    if is_compressed:
        uncompressed = tempfile.NamedTemporaryFile()
        with gzip.open(unsigned.name, "rb") as in_file, \
                open(uncompressed.name, "wb") as out_file:
            shutil.copyfileobj(in_file, out_file)

        # Finally, close the "unsigned" file (which is gzip compressed), and then
        # replace it with the uncompressed version.
        #
        # TODO(narayan): All this nastiness can be avoided if python 3.2 is in use,
        # we could just gzip / gunzip in-memory buffers instead.
        unsigned.close()
        unsigned = uncompressed

    signed = tempfile.NamedTemporaryFile()

    # For pre-N builds, don't upgrade to SHA-256 JAR signatures based on the APK's
    # minSdkVersion to avoid increasing incremental OTA update sizes. If an APK
    # didn't change, we don't want its signature to change due to the switch
    # from SHA-1 to SHA-256.
    # By default, APK signer chooses SHA-256 signatures if the APK's minSdkVersion
    # is 18 or higher. For pre-N builds we disable this mechanism by pretending
    # that the APK's minSdkVersion is 1.
    # For N+ builds, we let APK signer rely on the APK's minSdkVersion to
    # determine whether to use SHA-256.
    min_api_level = None
    if platform_api_level > 23:
        # Let APK signer choose whether to use SHA-1 or SHA-256, based on the APK's
        # minSdkVersion attribute
        min_api_level = None
    else:
        # Force APK signer to use SHA-1
        min_api_level = 1

    common.SignFile(unsigned.name, signed.name, keyname, pw,
                    min_api_level=min_api_level,
                    codename_to_api_level_map=codename_to_api_level_map)

    data = None
    if is_compressed:
        # Recompress the file after it has been signed.
        compressed = tempfile.NamedTemporaryFile()
        with open(signed.name, "rb") as in_file, \
                gzip.open(compressed.name, "wb") as out_file:
            shutil.copyfileobj(in_file, out_file)

        data = compressed.read()
        compressed.close()
    else:
        data = signed.read()

    unsigned.close()
    signed.close()

    return data
def load_DicFromPickleFile(pickleFilePath):
    pf = gzip.open(pickleFilePath, 'rb')
    retVal = cPickle.load(pf)
    pf.close()
    return retVal
def create_PickleFromDict(pickleFilePath, dic):
    pf = gzip.open(pickleFilePath, 'wb')
    cPickle.dump(dic, pf)
    pf.close()
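# Hedged round-trip sketch for the two pickle helpers above; the file name
# '__cache_demo.pkl.gz' is a placeholder chosen for illustration only.
demo = {'alpha': 1, 'beta': [2, 3]}
create_PickleFromDict('__cache_demo.pkl.gz', demo)
restored = load_DicFromPickleFile('__cache_demo.pkl.gz')
assert restored == demo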
def compress(file_name):
    '''Since static methods access neither the class nor an instance,
    Python does not pass them any implicit first parameter.'''
    with open(file_name, 'rb') as content:
        with gzip.open(file_name + '.gz', 'wb') as gzip_file:
            gzip_file.writelines(content)
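# Minimal usage sketch; the docstring suggests compress is a @staticmethod on
# some class not shown here, so it is called directly. 'notes.txt' is a
# placeholder input; the call writes notes.txt.gz next to it.
compress('notes.txt')
with gzip.open('notes.txt.gz', 'rb') as check:
    print(len(check.read()), 'bytes after decompression')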
def fopen(filename, mode='r'):
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    return open(filename, mode)
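# Usage sketch for fopen: the same call transparently handles plain and gzipped
# paths. Note that under Python 3 gzip.open with mode 'r' yields bytes, so 'rt'
# is the safer choice when text lines are expected; that caveat is an
# observation, not part of the original helper. 'example.txt.gz' is a placeholder.
with fopen('example.txt.gz', 'rt') as fh:
    for line in fh:
        print(line.rstrip())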
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)
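# The parse generator above eval()s every line, which assumes each line of the
# archive is a trusted Python literal. A more defensive variant, offered here
# only as an alternative sketch (not the original author's code), uses
# ast.literal_eval, which accepts literals but refuses arbitrary expressions.
import ast
import gzip

def parse_safe(path):
    with gzip.open(path, 'rt') as g:
        for l in g:
            yield ast.literal_eval(l)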
GQThreshold = args.min_gq

logger.info('Max parental variant fraction: %f', MAX_PARENTAL_VAR_FRAC)
logger.info('Min variant fraction in child: %f', MIN_VAR_FRAC_IN_CHILD)
logger.info('Min reads in child: %d', MIN_READS_IN_CHILD)
logger.info('logBayesFactorThreshold: %f', logBayesFactorThreshold)
logger.info('GQ threshold: %d', GQThreshold)

# Input VCF file of Platypus (or other) calls. Can be gzipped or
# plain text.
inVCFName = args.vcf
inVCFFile = None
logger.info('Using VCF: %s', inVCFName)

if inVCFName.endswith("gz"):
    inVCFFile = gzip.open(inVCFName, 'r')
else:
    inVCFFile = open(inVCFName, 'r')

pedFileName = args.ped  # Name of pedigree file
logger.info('Using PED: %s', pedFileName)

extension = args.extension

# Make output files using the name of the input VCF with various extensions
outMendelErrorsFileName = inVCFName.split(".")[0] + "_mendelErrors%s.vcf" % (extension)
outDeNovoVarsFileName = inVCFName.split(".")[0] + "_deNovoVariants%s.vcf" % (extension)
outFilteredDeNovoVarsFileName = inVCFName.split(".")[0] + "_deNovoVariantsPassingBayesianFilter%s.vcf" % (extension)

# Output file to contain list of mendelian inconsistency calls
outMendelErrorFile = open(outMendelErrorsFileName, 'w')
# Output file to contain list of de novo variants
outDeNovoVarsFile = open(outDeNovoVarsFileName, 'w')
    ax.set_xticks([])
    ax.set_yticks([])
    plt.show()

show_all_digit_components(X, 0)

#%%
"""MNIST Dataset"""
import gzip, pickle

DATA_PATH = 'data/mnist.pkl.gz'
with gzip.open(DATA_PATH, 'rb') as f:
    (X, y), _, _ = pickle.load(f, encoding='latin1')

# As a sanity check, we print out the size of the data.
print('Training data shape: ', X.shape)
print('Training labels shape: ', y.shape)

plt.figure(figsize=(12, 6))
pca = PCA().fit(X)
# Notice
plt.bar(range(200), pca.explained_variance_ratio_[:200], alpha=0.8, align='center')
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print(sess.run(c))

tf.set_random_seed(123)

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

#hello = tf.constant('Hello, TensorFlow!')
#sess = tf.Session()
#print(sess.run(hello))

import gzip
f = gzip.open('mnist.pkl.gz', 'rb')
if sys.version_info < (3,):
    data = pickle.load(f)
else:
    data = pickle.load(f, encoding='bytes')
f.close()

(feat_train, l_train), (feat_test, l_test) = data
feat_train = feat_train.astype('float32')
feat_test = feat_test.astype('float32')
feat_train /= 255
feat_test /= 255

pca_train = feat_train[:, :, :]
pca_train = pca_train.reshape(pca_train.shape[0], 28, 28, 1)
print(pca_train.shape)

npad = ((0, 0), (2, 2), (2, 2))
            cdata['series'][sname].append(None)
        else:
            iseries.sort()
            if len(iseries) % 2 == 1:
                median = iseries[(len(iseries) - 1) / 2]
            else:
                median = iseries[len(iseries) / 2]
            if iseries[0] == median:
                cdata['series'][sname].append(median)
            else:
                cdata['series'][sname].append([iseries[0], median, iseries[-1]])
    return cdata

for fname in files:
    print("Condensing %s" % (fname,))
    f = gzip.open(os.path.join(outdir, fname), 'r')
    fdata = json.loads(f.read())
    f.close()
    if not len(fdata['builds']):
        continue
    totaldata['allseries'].append({
        'fromtime': fdata['builds'][0]['time'],
        'totime': fdata['builds'][-1]['time'],
        'dataname': fname.replace('.json.gz', '')
    })
    cdata = condense_data(fdata)
    for x in cdata['series'].keys():
        totaldata['series'].setdefault(x, [])
        # If this series just appeared, or was absent from some datafiles before this,
        # make sure we pad out with nulls to keep the indexes lined up
        totaldata['series'][x].extend([None for y in range(len(totaldata['builds']) - len(totaldata['series'][x]))])
        totaldata['series'][x].extend(cdata['series'][x])
def main(): # admGlobMod = 'Ukraine*' if '-demo' in sys.argv else ''; # try: con = psycopg2.connect("dbname='postgres' user='' host='' password=''") except: con = psycopg2.connect("dbname='' user='' host='localhost' password=''") cur = con.cursor(); con.autocommit = True initTable = 'DROP TABLE IF EXISTS new_masked_ndvi;' if '-drop' in sys.argv else ''; print initTable cur.execute('%sCREATE TABLE IF NOT EXISTS new_masked_ndvi (region_id text, country text, state text, district text, start_date date, ndvi real, ndvi_count integer, anomaly real, anomaly_count integer, centr_lon real, centr_lat real, PRIMARY KEY (centr_lon, centr_lat, start_date));'%(initTable)) path = '/database/gimms.gsfc.nasa.gov/'; tmp = '/dev/shm/'; files = defaultdict(str) admShp2 = glob2.glob('/database/gimms.gsfc.nasa.gov/shapefiles/admin2_*%s.shp'%(admGlobMod)) if '-update' in sys.argv: # Select the oldest of all countries' last-updates cur.execute('SELECT DISTINCT country, MAX(start_date) FROM new_masked_ndvi GROUP BY country ORDER BY MAX(start_date) LIMIT 1') updateStart = cur.fetchone()[1] + datetime.timedelta(days=8) updateStartDay = updateStart.timetuple().tm_yday baseURL = "wget -r -nv -N -c -t 3 -R '*5v3*' -R '*DOY.tif*' ftp://gimms.gsfc.nasa.gov/MODIS/std/GMYD09Q1/tif/NDVI" os.chdir('/database/') for yrStart in xrange(updateStart.year, datetime.date.today().year + 1): dayOfYrStart = updateStartDay if yrStart == updateStart.year else 1 for dayOfYr in xrange(dayOfYrStart, 365, 8): #This 'for' loop downloads both the NDVI and Anomaly files per year/day-of-year pair. for pathSuffix in ['/%s/%s*'%(yrStart,dayOfYr),'_anom_S2003-2015/%s/%s*'%(yrStart,dayOfYr)]: os.system(baseURL + pathSuffix) for fp in glob2.glob(path + '*/**/*.tif.gz'): files[os.path.split(fp)[1]] = os.path.split(fp)[0] def freeSpace(): freeSpace = os.statvfs('/dev/shm') # Clear tmp space if (freeSpace[2] - freeSpace[3]) * 1.125 > freeSpace[2]: delFiles={} for fl in glob2.glob('/dev/shm/*.tif'): delFiles[str(os.path.getctime(fl))] = fl delFileKeys = sorted(delFiles.keys()) print '\n\t*\tFree space: %s, Used space: %s\t*\n'%(freeSpace[3], freeSpace[2] - freeSpace[3]) print '\t\t\tDeleting %s files, starting with %s\n'%(int(len(delFileKeys) / 1.125), delFiles[delFileKeys[0]]) for dF in xrange(int(len(delFileKeys) / 1.125)): os.remove(delFiles[delFileKeys[dF]]) def getSt(shp,f,xy): shp=shp; f=f; xy=xy; x=int(xy[1:3]); y=int(xy[4:6]) # Just get the data (raster_out) - calculating stats further down with rasterio.open(f) as src: affine = src.affine dat = src.read(1) xyMask = mask[y*4000:y*4000+4000,x*4000:x*4000+4000] statsMask = tuple(xmea for xmea in r.zonal_stats(shp, xyMask, stats='count', geojson_out=True, raster_out=True, affine=affine, nodata=None) if floor((180 + xmea['properties']['mini_raster_affine'][2]) / 9) == int(xy[1:3]) and floor((90 - xmea['properties']['mini_raster_affine'][5]) / 9) == int(xy[4:6])) stats = tuple(mea for mea in r.zonal_stats(shp, dat, stats='count', geojson_out=True, raster_out=True, affine=affine, nodata=None) if floor((180 + mea['properties']['mini_raster_affine'][2]) / 9) == int(xy[1:3]) and floor((90 - mea['properties']['mini_raster_affine'][5]) / 9) == int(xy[4:6])) return [stats, statsMask] def getSQLRecords(): cur.execute('SELECT DISTINCT country FROM new_masked_ndvi') return cur.fetchall() if '-demo' not in sys.argv and '-update' not in sys.argv: # IF THIS IS NOT DEMO MODE OR UPDATE MODE... 
records = [recor[0].replace('\xc3\x83\xc2\x85','').lower() for recor in getSQLRecords()] # COUNTRY FILTER # for filter in ('antarctica','greenland'): records.append(filter) # for filter in ('antarctica','greenland'): records.append(filter) else: records = [] # Open the mask file print 'Opening mask file...' mask = gdal.Open('/database/gimms.gsfc.nasa.gov/mask.tif') print 'Mask file opened'; sys.stdout.flush() mask = np.array(mask.GetRasterBand(1).ReadAsArray()) print 'Mask read into array'; sys.stdout.flush() enALS = 2 for adLevShp in [admShp2]: #This is really just a single-element array since we're only doing Admin Level 2 files. for shapeFile in adLevShp: lcShapeFile = shapeFile.decode('utf-8').lower()[0:-4] + str(enALS); continu = False for rec in records: continu = True if rec in lcShapeFile else continu if continu is True: continue # Skip this shapefile -=- its records are in the database countryDateAnom = defaultdict(dict); countryDateNDVI = defaultdict(dict) print '\nShapefile: %s'%(shapeFile); sys.stdout.flush() with fiona.open(shapeFile) as shp: # Country ident value comes from either the ISO or ISO2 code. name0 = 'NAME_ENGLI'; name1 = 'NAME_ENGLI'; name2 = 'NAME_ENGLI'; ident = 'ISO2' try: shp[0]['properties'][name0] except: name0 = 'NAME_0'; name1 = 'NAME_1'; name2 = 'NAME_1'; ident = 'HASC_1' try: shp[0]['properties']['NAME_2']; name2 = 'NAME_2'; ident = 'HASC_2' except: pass xyList = []; for x in xrange(int(floor((180 + shp.bounds[0]) / 9)), int(floor((180 + shp.bounds[2]) / 9)) + 1): for y in xrange(int(floor((90 - shp.bounds[3]) / 9)), int(floor((90 - shp.bounds[1]) / 9)) + 1): if 1 in mask[y*4000:y*4000+4000,x*4000:x*4000+4000]: xyList.append('x%02dy%02d'%(x,y)) else: print 'x%02dy%02d removed due to mask'%(x,y) if '-update' in sys.argv: yrStart = updateStart.year # Reset for use further down. dayOfYrStart = updateStartDay else: dayOfYrStart = 1 yrStart = 2002 # for doy in xrange(209,265,8): for doy in xrange(dayOfYrStart,365,8): for xy in xyList: print '\nXY:%s'%(xy); sys.stdout.flush() fdoy = '%03d'%(doy) print '\n %s :: '%(fdoy),; sys.stdout.flush() for yr in xrange(yrStart,datetime.datetime.now().year + 1): anFl = 'GMYD09Q1.A%s%s.08d.latlon.%s.6v1.NDVI_anom_S2003-2015.tif.gz'%(yr,fdoy,xy) ndFl = 'GMYD09Q1.A%s%s.08d.latlon.%s.6v1.NDVI.tif.gz'%(yr,fdoy,xy) if anFl in files.keys() and ndFl in files.keys(): aF = tmp + anFl[0:-3] # Anomaly .tif file if not os.path.isfile(aF): try: with gzip.open(os.path.join(files[anFl],anFl),'rb') as aZ: with open(aF,'w+b') as aT: aT.write(aZ.read()) except Exception as gzAf: print 'Could not extract', gzAf continue try: aStatsX = getSt(shp,aF,xy) aStats = aStatsX[0] aMaskStats = aStatsX[1] if aStats == (): print '! No Data, Skipping this year for this xy for this doy !' 
continue except Exception as aStErr: print '\tCould not get stats', aStErr continue print '%s'%(str(yr)[-2:]),; sys.stdout.flush() for aStI,aSt in enumerate(aStats): #For each Admin Level 2 #area in the current country (shp file): lonLatCentroid = (aSt['properties'] ['mini_raster_affine'][2], aSt['properties'] ['mini_raster_affine'][5]) try: aSt['properties'][name0] except: continue names = '%s:%s:%s'%(aSt['properties'][name0], aSt['properties'][name1], aSt['properties'][name2]) namesDay = '%s:%s'%(names,fdoy) namesDayYear = '%s:%s'%(namesDay,yr) regionID = aSt['properties'][ident] if regionID is None or regionID == u'': regionID = aSt['properties']['ISO'] aMask = [~np.bool_(aMaskStats[aStI] ['properties']['mini_raster_array']).flatten()] aData = ma.masked_array(np.float32 (aSt['properties']['mini_raster_array']) .flatten(), aMask) anom = (ma.masked_outside(aData,0,250).compressed() - 125) * .008 try: countryDateAnom[namesDayYear]['level'] = enALS countryDateAnom[namesDayYear]['rId'] = regionID countryDateAnom[namesDayYear]['lonlatcentroid'] = lonLatCentroid countryDateAnom[namesDayYear]['anom'] = (countryDateAnom[namesDayYear]['anom'] + np.mean(anom)) / 2 if 'anom' in countryDateAnom[namesDayYear].keys() else np.mean(anom) countryDateAnom[namesDayYear]['anomcount'] = countryDateAnom[namesDayYear]['anomcount'] + len(anom) if 'anomcount' in countryDateAnom[namesDayYear].keys() else len(anom) except: continue
faker = Faker()
# timestr = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(30), "%Y%m%d-%H%M%S")
timestr = time.strftime("%Y%m%d-%H%M%S")
otime = datetime.datetime.now()

outFileName = 'orders_log_' + timestr + '.log' if not file_prefix else file_prefix + '_access_log_' + timestr + '.log'

for case in switch(output_type):
    if case('LOG'):
        f = open(outFileName, 'w')
        break
    if case('GZ'):
        f = gzip.open(outFileName + '.gz', 'w')
        break
    if case('CONSOLE'):
        pass
    if case():
        f = sys.stdout

response = ["200", "404", "500", "301"]
verb = ["GET", "POST"]
# resources = ["/user/register", "/user/login", "/wp-content", "/wp-admin", "/explore", "/search/tag/list",
#              "/app/main/posts",
#              "/posts/posts/explore", "/apps/cart.jsp?appID="]
resources = ["/app/cart ", "/app/orders "]
ualist = [faker.firefox, faker.chrome, faker.safari, faker.internet_explorer, faker.opera]
def iterator(self): self.ver = None # File version self.obj = None # Problem structure self.mapnum = 0 self.mapstacknum = 0 self.mapstackdim = list() self.mapstackdomain = list() self.varnum = 0 self.varstacknum = 0 self.varstackdim = list() self.varstackdomain = list() self.intvarnum = 0 self.intvar = list() self.psdmapnum = 0 self.psdmapdim = list() self.psdvarnum = 0 self.psdvardim = list() self.objfnnz = 0 # Objective coefficients self.objfsubj = list() self.objfsubk = list() self.objfsubl = list() self.objfval = list() self.objannz = 0 self.objasubj = list() self.objaval = list() self.objbval = 0 self.fnnz = 0 # Scalar map coefficients self.fsubi = list() self.fsubj = list() self.fsubk = list() self.fsubl = list() self.fval = list() self.annz = 0 self.asubi = list() self.asubj = list() self.aval = list() self.bnnz = 0 self.bsubi = list() self.bval = list() self.hnnz = 0 # PSD map coefficients self.hsubi = list() self.hsubj = list() self.hsubk = list() self.hsubl = list() self.hval = list() self.dnnz = 0 self.dsubi = list() self.dsubk = list() self.dsubl = list() self.dval = list() self.change = False self.simplebounds = False simplemapvaridx = list() simplemapsign = list() simplemapconst = list() keyset = 0 keyquery = self.fullkeyquery.copy() (linenum, line) = (-1, "") [self.name, filetype] = os.path.splitext(os.path.basename(self.file)) if filetype.lower() == '.gz': self.name = os.path.splitext(self.name)[0] ff = gzip.open(self.file, 'rt') else: ff = open(self.file, 'rt') f = enumerate(ff) try: for (linenum, line) in f: line = self.__prepare_line(line) # Ignore comments between blocks if line.startswith('#'): continue # Ignore empty lines between blocks if not line: continue # Stop when requested information has been gathered if len(keyquery) == 0: break # # Keyword set: File description keywords # if keyset == 0: if line == "VER": (linenum, line) = next(f) self.ver = int(self.__prepare_line(line)) keyquery.discard("VER") keyquery.discard("VER:HEAD") continue # Unrecognized line. Going to next set of keywords. 
if line in self.keywords: keyset = self.__inc_keyset(keyset) keyquery -= self.keywordqueryset[keyset-1] else: raise Exception('Keyword not recognized') # # Keyword set: Structural keywords (note the default values) # if keyset == 1: if line == "OBJSENSE": if self.obj is not None: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) self.obj = self.__prepare_line(line) keyquery.discard("OBJSENSE") keyquery.discard("OBJSENSE:HEAD") continue if line == "PSDVAR": if self.psdvarnum > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) self.psdvarnum = int(self.__prepare_line(line)) if "PSDVAR" in keyquery: self.psdvardim = [0]*self.psdvarnum for i in range(self.psdvarnum): (linenum, line) = next(f) self.psdvardim[i] = int( self.__prepare_line(line)) elif not keyquery <= set(["PSDVAR:HEAD"]): for i in range(self.psdvarnum): next(f) keyquery.discard("PSDVAR") keyquery.discard("PSDVAR:HEAD") continue if line == "VAR": if self.varnum > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.varnum = int(buf[0]) self.varstacknum = int(buf[1]) if "VAR" in keyquery: self.varstackdomain = ['']*self.varstacknum self.varstackdim = [0]*self.varstacknum for i in range(self.varstacknum): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.varstackdomain[i] = buf[0] self.varstackdim[i] = int(buf[1]) elif not keyquery <= set(["VAR:HEAD"]): for i in range(self.varstacknum): next(f) keyquery.discard("VAR") keyquery.discard("VAR:HEAD") continue if line == "INT": if self.intvarnum > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) self.intvarnum = int(self.__prepare_line(line)) if "INT" in keyquery: self.intvar = [0]*self.intvarnum for i in range(self.intvarnum): (linenum, line) = next(f) self.intvar[i] = int(self.__prepare_line(line)) elif not keyquery <= set(["INT:HEAD"]): for i in range(self.intvarnum): next(f) keyquery.discard("INT") keyquery.discard("INT:HEAD") continue if line == "PSDCON": if self.psdmapnum > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) self.psdmapnum = int(self.__prepare_line(line)) if "PSDCON" in keyquery: self.psdmapdim = [0]*self.psdmapnum for i in range(self.psdmapnum): (linenum, line) = next(f) self.psdmapdim[i] = int( self.__prepare_line(line)) elif not keyquery <= set(["PSDCON:HEAD"]): for i in range(self.psdmapnum): next(f) keyquery.discard("PSDCON") keyquery.discard("PSDCON:HEAD") continue if line == "CON": if self.mapnum > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.mapnum = int(buf[0]) self.mapstacknum = int(buf[1]) if "CON" in keyquery: self.mapstackdomain = ['']*self.mapstacknum self.mapstackdim = [0]*self.mapstacknum for i in range(self.mapstacknum): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.mapstackdomain[i] = buf[0] self.mapstackdim[i] = int(buf[1]) elif not keyquery <= set(["CON:HEAD"]): for i in range(self.mapstacknum): next(f) keyquery.discard("CON") keyquery.discard("CON:HEAD") continue # Unrecognized line. Going to next set of keywords. 
if line in self.keywords: keyset = self.__inc_keyset(keyset) keyquery = self.__resolve_keyquery_logic(keyquery) keyquery -= self.keywordqueryset[keyset-1] self.simplebounds = (all(x in self.fullkeyquery for x in ['VAR', 'CON']) and (not self.mapnum or 'BCOORD' in keyquery) and (not self.mapnum or not self.varnum or 'ACOORD' in keyquery) and (not self.mapnum or not self.psdvarnum or 'FCOORD' in keyquery)) if self.simplebounds: simplemapvaridx = [-1]*self.mapnum simplemapsign = [1.0]*self.mapnum simplemapconst = [0.0]*self.mapnum if len(keyquery) == 0: break else: raise Exception('Keyword not recognized') # # Keyword set: Data keywords # if keyset == 2: if line == "OBJFCOORD": if not self.change and self.objfnnz != 0.0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) curnnz = int(self.__prepare_line(line)) if "OBJFCOORD" in keyquery: self.objfsubj += [0]*curnnz self.objfsubk += [0]*curnnz self.objfsubl += [0]*curnnz self.objfval += [0.0]*curnnz for i in range(self.objfnnz, self.objfnnz + curnnz): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.objfsubj[i] = int(buf[0]) self.objfsubk[i] = int(buf[1]) self.objfsubl[i] = int(buf[2]) self.objfval[i] = float(buf[3]) elif not keyquery <= set(["OBJFCOORD:HEAD"]): for i in range(self.objfnnz, self.objfnnz + curnnz): next(f) self.objfnnz += curnnz keyquery.discard("OBJFCOORD") keyquery.discard("OBJFCOORD:HEAD") continue if line == "OBJACOORD": if not self.change and self.objannz != 0.0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) curnnz = int(self.__prepare_line(line)) if "OBJACOORD" in keyquery: self.objasubj += [0]*curnnz self.objaval += [0.0]*curnnz for i in range(self.objannz, self.objannz + curnnz): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.objasubj[i] = int(buf[0]) self.objaval[i] = float(buf[1]) elif not keyquery <= set(["OBJACOORD:HEAD"]): for i in range(self.objannz, self.objannz + curnnz): next(f) self.objannz += curnnz keyquery.discard("OBJACOORD") keyquery.discard("OBJACOORD:HEAD") continue if line == "OBJBCOORD": if not self.change and self.objbval != 0.0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) self.objbval = float(self.__prepare_line(line)) keyquery.discard("OBJBCOORD") keyquery.discard("OBJBCOORD:HEAD") continue if line == "FCOORD": if not self.change and self.fnnz > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) curnnz = int(self.__prepare_line(line)) if "FCOORD" in keyquery: self.fsubi += [0]*curnnz self.fsubj += [0]*curnnz self.fsubk += [0]*curnnz self.fsubl += [0]*curnnz self.fval += [0.0]*curnnz for i in range(self.fnnz, self.fnnz + curnnz): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.fsubi[i] = int(buf[0]) self.fsubj[i] = int(buf[1]) self.fsubk[i] = int(buf[2]) self.fsubl[i] = int(buf[3]) self.fval[i] = float(buf[4]) if self.simplebounds: simplemapvaridx[self.fsubi[i]] = -2 elif not keyquery <= set(["FCOORD:HEAD"]): for i in range(self.fnnz, self.fnnz + curnnz): next(f) self.fnnz += curnnz keyquery.discard("FCOORD") keyquery.discard("FCOORD:HEAD") continue if line == "ACOORD": if not self.change and self.annz > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) curnnz = int(self.__prepare_line(line)) if "ACOORD" in keyquery: self.asubi += [0]*curnnz self.asubj += [0]*curnnz 
self.aval += [0.0]*curnnz for i in range(self.annz, self.annz + curnnz): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.asubi[i] = int(buf[0]) self.asubj[i] = int(buf[1]) self.aval[i] = float(buf[2]) if self.simplebounds: if abs(self.aval[i]) == 1.0 and simplemapvaridx[self.asubi[i]] == -1: simplemapvaridx[self.asubi[i] ] = self.asubj[i] simplemapsign[self.asubi[i] ] = self.aval[i] else: simplemapvaridx[self.asubi[i]] = -2 elif not keyquery <= set(["ACOORD:HEAD"]): for i in range(self.annz, self.annz + curnnz): next(f) self.annz += curnnz keyquery.discard("ACOORD") keyquery.discard("ACOORD:HEAD") continue if line == "BCOORD": if not self.change and self.bnnz > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) curnnz = int(self.__prepare_line(line)) if "BCOORD" in keyquery: self.bsubi += [0]*curnnz self.bval += [0.0]*curnnz for i in range(self.bnnz, self.bnnz + curnnz): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.bsubi[i] = int(buf[0]) self.bval[i] = float(buf[1]) if self.simplebounds: simplemapconst[self.bsubi[i] ] = self.bval[i] elif not keyquery <= set(["BCOORD:HEAD"]): for i in range(self.bnnz, self.bnnz + curnnz): next(f) self.bnnz += curnnz keyquery.discard("BCOORD") keyquery.discard("BCOORD:HEAD") continue if line == "HCOORD": if not self.change and self.hnnz > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) curnnz = int(self.__prepare_line(line)) if "HCOORD" in keyquery: self.hsubi += [0]*curnnz self.hsubj += [0]*curnnz self.hsubk += [0]*curnnz self.hsubl += [0]*curnnz self.hval += [0.0]*curnnz for i in range(self.hnnz, self.hnnz + curnnz): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.hsubi[i] = int(buf[0]) self.hsubj[i] = int(buf[1]) self.hsubk[i] = int(buf[2]) self.hsubl[i] = int(buf[3]) self.hval[i] = float(buf[4]) elif not keyquery <= set(["HCOORD:HEAD"]): for i in range(self.hnnz, self.hnnz + curnnz): next(f) self.hnnz += curnnz keyquery.discard("HCOORD") keyquery.discard("HCOORD:HEAD") continue if line == "DCOORD": if not self.change and self.dnnz > 0: raise Exception( 'Keyword also found earlier and can only appear once') (linenum, line) = next(f) curnnz = int(self.__prepare_line(line)) if "DCOORD" in keyquery: self.dsubi += [0]*curnnz self.dsubk += [0]*curnnz self.dsubl += [0]*curnnz self.dval += [0.0]*curnnz for i in range(self.dnnz, self.dnnz + curnnz): (linenum, line) = next(f) buf = self.__prepare_line(line).split(' ') self.dsubi[i] = int(buf[0]) self.dsubk[i] = int(buf[1]) self.dsubl[i] = int(buf[2]) self.dval[i] = float(buf[3]) elif not keyquery <= set(["DCOORD:HEAD"]): for i in range(self.dnnz, self.dnnz + curnnz): next(f) self.dnnz += curnnz keyquery.discard("DCOORD") keyquery.discard("DCOORD:HEAD") continue if line == "CHANGE": self.change = True self.__missing_keyword_scan(keyset) # Stop at current state of variables yield self keyset = 2 keyquery = self.fullkeyquery & ( self.keywordqueryset[2] | set([None])) continue raise Exception('Keyword not recognized') # # End of file reached at this point # (linenum, line) = (linenum+1, "") if len(keyquery) != 0: self.__missing_keyword_scan(keyset) # Compute variable bounds when information is available if self.simplebounds: self.blx = [float("-inf")] * self.varnum self.bux = [float("+inf")] * self.varnum j = -1 for k in range(self.varstacknum): for km in range(self.varstackdim[k]): j = j + 1 if self.varstackdomain[k] in ['L=', 'L+'] or 
(self.varstackdomain[k] == 'Q' and km <= 1) or (self.varstackdomain[k] == 'QR' and km <= 2): self.blx[j] = max(self.blx[j], 0.0) if self.varstackdomain[k] in ['L=', 'L-']: self.bux[j] = min(self.bux[j], 0.0) i = -1 for k in range(self.mapstacknum): for km in range(self.mapstackdim[k]): i = i + 1 j = simplemapvaridx[i] if j >= 0: if self.mapstackdomain[k] in ['L=', 'L+'] or (self.mapstackdomain[k] == 'Q' and km <= 1) or (self.mapstackdomain[k] == 'QR' and km <= 2): if simplemapsign[i] > 0: self.blx[j] = max( self.blx[j], -simplemapconst[i]*simplemapsign[i]) else: self.bux[j] = min( self.bux[j], -simplemapconst[i]*simplemapsign[i]) if self.mapstackdomain[k] in ['L=', 'L-']: if simplemapsign[i] > 0: self.bux[j] = min( self.bux[j], -simplemapconst[i]*simplemapsign[i]) else: self.blx[j] = max( self.blx[j], -simplemapconst[i]*simplemapsign[i]) # Stop at current state of variables yield self except Exception as e: if isinstance(e, StopIteration): msg = 'Unexpected end of file' else: msg = str(e) raise Exception(''.join([ msg, '. File: ', self.file, '\n', str(linenum+1), ': ', line, '\n'])) finally: ff.close()
    test_list.append({
        'label': line[0][1:-1],
        'title': line[1][1:-1],
        'description': line[2][1:-1]
    })

# indices
train_index = list(range(len(train_list)))
test_index = list(range(len(train_list), len(train_list) + len(test_list)))
index = {'train': train_index, 'test': test_index}
assert len(set(index['train']).intersection(index['test'])) == 0

with gzip.open('index.json.gz', mode='wt') as file:
    json.dump(index, file)

all_list = train_list
all_list.extend(test_list)
with gzip.open('data.json.gz', mode='wt') as file:
    json.dump(all_list, file)

# test: re-read the archives and check the reloaded index
with gzip.open('data.json.gz', mode='rt') as file:
    data_list = json.load(file)
with gzip.open('index.json.gz', 'rt') as file:
    index_dict = json.load(file)
assert len(set(index_dict['train']).intersection(index_dict['test'])) == 0
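# A minimal sketch, not part of the original script, showing how the saved
# index can be used to recover the two splits from data.json.gz; it assumes
# the records keep the order in which they were dumped above.
import gzip
import json

with gzip.open('data.json.gz', mode='rt') as file:
    data_list = json.load(file)
with gzip.open('index.json.gz', mode='rt') as file:
    index_dict = json.load(file)

train_records = [data_list[i] for i in index_dict['train']]
test_records = [data_list[i] for i in index_dict['test']]
assert len(train_records) + len(test_records) == len(data_list)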
import gzip
import argparse
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

#st.norm.ppf(q, loc=0, scale=1)

parser = argparse.ArgumentParser(description = "Convert QTLtools output into format suitable for fgwas.",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--qtltools", help = "Sorted QTLtools output file.")
parser.add_argument("--perm", help = "QTLtools output from the permutation run.")
parser.add_argument("--annot", help = "Variant annotations for fgwas (sorted by position).")
parser.add_argument("--N", help = "QTL sample size.")
args = parser.parse_args()

#Set up input files
qtltools_file = gzip.open(args.qtltools, 'r')
fgwas_file = gzip.open(args.annot, 'r')
perm_file = gzip.open(args.perm, 'r')
n_samples = args.N

#Make a dictionary of phenotypes to be included in the fgwas output
phenotype_dict = dict()
for line in perm_file:
    line = line.decode("utf8").rstrip()
    fields = line.split()
    phenotype_id = fields[5]
    phenotype_dict[phenotype_id] = 1
perm_file.close()

#Make full header
header = "SNPID CHR POS Z F N SEGNUMBER"
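# Hedged sketch, not taken from the original script: the commented-out
# st.norm.ppf() hint above suggests converting a nominal p-value plus the
# sign of the effect into the signed Z column that fgwas expects. The helper
# name and the beta argument are assumptions for illustration only.
import scipy.stats as st

def pvalue_to_z(pvalue, beta):
    """Two-sided p-value and effect sign -> signed Z score."""
    z = abs(st.norm.ppf(pvalue / 2.0))
    return z if beta >= 0 else -z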
anom = (ma.masked_outside(aData, 0, 250).compressed() - 125) * .008
try:
    countryDateAnom[namesDayYear]['level'] = enALS
    countryDateAnom[namesDayYear]['rId'] = regionID
    countryDateAnom[namesDayYear]['lonlatcentroid'] = lonLatCentroid
    countryDateAnom[namesDayYear]['anom'] = (countryDateAnom[namesDayYear]['anom'] + np.mean(anom)) / 2 if 'anom' in countryDateAnom[namesDayYear].keys() else np.mean(anom)
    countryDateAnom[namesDayYear]['anomcount'] = countryDateAnom[namesDayYear]['anomcount'] + len(anom) if 'anomcount' in countryDateAnom[namesDayYear].keys() else len(anom)
except:
    continue
# End anomaly loop

nF = tmp + ndFl[0:-3]  # NDIV .tif file
if not os.path.isfile(nF):
    freeSpace()
    try:
        with gzip.open(os.path.join(files[ndFl], ndFl), 'rb') as nZ:
            with open(nF, 'w+b') as nT:
                nT.write(nZ.read())
    except Exception as gzNf:
        print 'Could not extract', gzNf
        continue
try:
    nStatsX = getSt(shp, nF, xy)
    nStats = nStatsX[0]
    nMaskStats = nStatsX[1]
    if nStats == ():
        print '! No Data, Skipping this grid point !'
        continue  # Will skip to the next year for this day/xy
except Exception as nStErr:
def gunzip_file(fname_in, fname_out):
    with gzip.open(fname_in, 'rb') as f_in:
        with open(fname_out, 'wb') as f_out:
            copyfileobj(f_in, f_out)
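# A possible companion helper (an addition, not from the original module):
# the reverse operation, compressing a plain file into a .gz archive with the
# same streaming copy so large files never sit fully in memory. File names in
# the usage lines are placeholders.
import gzip
from shutil import copyfileobj

def gzip_file(fname_in, fname_out):
    with open(fname_in, 'rb') as f_in:
        with gzip.open(fname_out, 'wb') as f_out:
            copyfileobj(f_in, f_out)

# example round trip:
# gzip_file('reads.fastq', 'reads.fastq.gz')
# gunzip_file('reads.fastq.gz', 'reads.fastq')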
    return(name.replace('/', '_'))

files = repo_file_list(bones = False)

spectracount = 0
photocount = 0
eventswithspectra = 0
eventswithphoto = 0

for fcnt, eventfile in enumerate(tqdm(sorted(files, key=lambda s: s.lower()))):
    #if fcnt > 100:
    #    break

    fileeventname = os.path.splitext(os.path.basename(eventfile))[0].replace('.json', '')

    if eventfile.split('.')[-1] == 'gz':
        with gzip.open(eventfile, 'rt') as f:
            filetext = f.read()
    else:
        with open(eventfile, 'r') as f:
            filetext = f.read()

    item = json.loads(filetext, object_pairs_hook=OrderedDict)
    namekey = list(item.keys())[0]
    item = item[namekey]

    if namekey != item['name']:
        tqdm.write(namekey + ' has different name from its key ' + item['name'])

    if 'spectra' in item:
        eventswithspectra += 1
        spectracount += len(item['spectra'])
def _iterate(self):
    """iterate over multiple files."""

    def _iter(infile):
        identifier = None
        for line in infile:
            if line.startswith("#"):
                continue
            if line.startswith(">"):
                if self.regexIdentifier:
                    try:
                        identifier = re.search(self.regexIdentifier,
                                               line[1:-1]).groups()[0]
                    except AttributeError:
                        raise ValueError(
                            "could not parse identifier from line %s - check the input" % line[1:-1])
                else:
                    identifier = re.split(r"\s", line[1:-1])[0]
            else:
                if not identifier:
                    raise ValueError(
                        "refusing to emit sequence without identifier - check the input")
                yield identifier, line.strip()

    for filename in self.filenames:
        if self.format == "tar.gz" or self.format == "tar" or (
                self.format == "auto" and filename.endswith("tar.gz")):
            if filename == "-":
                tf = tarfile.open(fileobj=sys.stdin, mode="r|*")
            else:
                tf = tarfile.open(filename, mode="r")
            for f in tf:
                b, ext = os.path.splitext(f.name)
                if ext.lower() in (".fasta", ".fa"):
                    E.info("extracting %s" % f.name)
                    infile = tf.extractfile(f)
                    for x in _iter(infile):
                        yield x
                else:
                    E.info("skipping %s" % f.name)
            if tf != sys.stdin:
                tf.close()
            continue
        elif self.format == "fasta.gz" or (self.format == "auto" and
                                           filename.endswith(".gz")):
            infile = gzip.open(filename, "r")
        elif filename == "-":
            infile = sys.stdin
        else:
            infile = open(filename, "r")

        for x in _iter(infile):
            yield x

        if filename != "-":
            infile.close()
    # a bare ``raise StopIteration`` inside a generator is an error under
    # PEP 479 (Python 3.7+); returning ends the iteration cleanly instead
    return
def parse_meta(path):
    with gzip.open(path, 'rt') as gz:
        for line in gz:
            yield literal_eval(line)
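# Illustrative use of parse_meta (the file name and the counting are
# assumptions, not from the source): because the generator yields one parsed
# record at a time, a large metadata dump can be scanned without holding it
# all in memory.
n_records = sum(1 for _ in parse_meta('metadata.json.gz'))
print(n_records)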
def load_object(fname):
    return cPickle.load(gzip.open(fname, "rb"))
def lambda_handler(event, context): # Attribute bucket and file name/path to variables bucket = event['Records'][0]['s3']['bucket']['name'] key = event['Records'][0]['s3']['object']['key'] if(bucket == None or key == None): return # Temporary location to save file downloaded from S3 s3obj = tempfile.NamedTemporaryFile(mode='w+b',delete=False) # Download file to temp file s3.download_file(bucket, key, s3obj.name) with gzip.open(s3obj.name, 'rb') as f: if ("interface-id" not in f.readline().decode()): print("Not VPCFlow, exiting.") # return eventcount = 1 for line in f: event_array = line.split() event_dict = {} # Parse array to dict to prepare for JSON conversion event_dict['version'] = event_array[0].decode() event_dict['account-id'] = event_array[1].decode() event_dict['interface-id'] = event_array[2].decode() event_dict['srcaddr'] = event_array[3].decode() event_dict['dstaddr'] = event_array[4].decode() event_dict['srcport'] = event_array[5].decode() event_dict['dstport'] = event_array[6].decode() event_dict['protocol'] = event_array[7].decode() event_dict['packets'] = event_array[8].decode() event_dict['bytes'] = event_array[9].decode() event_dict['start'] = event_array[10].decode() event_dict['end'] = event_array[11].decode() event_dict['action'] = event_array[12].decode() event_dict['log-status'] = event_array[13].decode() # Prepare JSON to send to ES data = json.dumps(event_dict).encode('utf-8') print(data) event_date = dt.today().strftime('%Y-%m-%d') canonical_uri = '/' + indexname + '-' + event_date + '/_doc' # url endpoint for our ES cluster url = 'https://' + host + canonical_uri print( "Event {} url : {}\n".format(eventcount, url)) # aws signed url stuff - for comments on this check their example page linked on top comment t = datetime.datetime.utcnow() amz_date = t.strftime('%Y%m%dT%H%M%SZ') date_stamp = t.strftime('%Y%m%d') canonical_querystring = '' canonical_headers = 'content-type:' + content_type + '\n' + \ 'host:' + host + '\n' + \ 'x-amz-date:' + amz_date + '\n' signed_headers = 'content-type;host;x-amz-date' payload_hash = hashlib.sha256(data).hexdigest() canonical_request = method + '\n' + \ canonical_uri + '\n' + \ canonical_querystring + '\n' + \ canonical_headers + '\n' + \ signed_headers + '\n' + \ payload_hash algorithm = 'AWS4-HMAC-SHA256' credential_scope = date_stamp + '/' + region + '/' + service + '/' + 'aws4_request' string_to_sign = algorithm + '\n' + \ amz_date + '\n' + \ credential_scope + '\n' + \ hashlib.sha256(canonical_request.encode('utf-8')).hexdigest() signing_key = get_signature_key(secret_key, date_stamp, region, service) signature = hmac.new(signing_key, string_to_sign.encode('utf-8'), hashlib.sha256).hexdigest() authorization_header = algorithm + ' ' + \ 'Credential=' + access_key + '/' + credential_scope + ', ' + \ 'SignedHeaders=' + signed_headers + ', ' + \ 'Signature=' + signature headers = {'Content-Type':content_type, 'X-Amz-Date':amz_date, 'Authorization':authorization_header, 'X-Amz-Security-Token': session_token} # sends the json to elasticsearch req = requests.post(url, data=data, headers=headers) print( "Attempt 0 status code: {}".format(req.status_code)) print( "response:\n---\n{}\n---\n".format( req.text )) retry_counter = 1 """ if we fail for some reason we will retry 3 times you will most likely have errors if you're copying a huge ammount of logs from an old bucket to your new one. For normal usage you shouldnt have to worry about this. 
I got it in production with 90 aws accounts pointing to the same bucket, and a pair of m3.mediums on the ES cluster, with 0 errors. I dont raise an exception on errors to not miss all the other entries in the file, or risk repeating any inserts done before the error. """ # if our status code is not successfull, and our retry counter is less than 4 while (req.status_code != 201) and (retry_counter < 4): print( "Got code {}. Retrying {} of 3".format( req.status_code, retry_counter) ) # send the data to ES again req = requests.post(url, data=data, headers=headers) print( "status code: {}".format(req.status_code)) retry_counter += 1 eventcount +=1 s3obj.close() os.unlink(s3obj.name)
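# A hedged alternative sketch (not the original code): the retry behaviour
# described in the docstring above could be factored into a small helper so
# every document post shares the same "try up to N times, never raise" logic.
# The helper name and its defaults are assumptions for illustration.
import requests

def post_with_retries(url, data, headers, attempts=4, ok_status=201):
    req = None
    for attempt in range(1, attempts + 1):
        req = requests.post(url, data=data, headers=headers)
        if req.status_code == ok_status:
            break
        if attempt < attempts:
            print("Got code {}. Retrying {} of {}".format(
                req.status_code, attempt, attempts - 1))
    return req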
res = ss.run('cli -c "file compress file /var/tmp/showintext"')
gz_flag = "syntax error" not in res[1]
if not gz_flag:
    logging.warning(f"{ip} not support file compress file /var/tmp/showintext")
ss.close()

# if not telnet, download the archive via SCP; otherwise cat the file and save it into an archive
if not telnet_flag:
    # with SCP(dev, progress=True) as scp:
    with SCP(dev) as scp:
        if gz_flag:
            scp.get("/var/tmp/showintext.gz", local_path=ip + "showintext.gz")
        else:
            scp.get("/var/tmp/showintext", local_path=ip + "showintext")
            fp = open(ip + "showintext", "rb")
            data = fp.read()
            bindata = bytearray(data)
            with gzip.open(ip + "showintext.gz", "wb") as f:
                f.write(bindata)
            if os.path.exists(ip + "showintext"):
                os.remove(ip + "showintext")
            fp.close()
else:
    logging.warning(f"{ip} not work SCP cat file /var/tmp/showintext and gzip")
    with gzip.open(ip + "showintext.gz", mode="wb") as file:
        dev.timeout = 600
        fs = FS(dev)
        result = fs.cat("/var/tmp/showintext")
        if result:
            file.write(result.encode())

# remove the file from the chassis
if not telnet_flag and gz_flag:
def build_devhelp(self, outdir, outname): self.info('dumping devhelp index...') # Basic info root = etree.Element('book', title=self.config.html_title, name=self.config.project, link="index.html", xmlns="http://www.devhelp.net/book", version="2", language="python") tree = etree.ElementTree(root) # TOC chapters = etree.SubElement(root, 'chapters') tocdoc = self.env.get_and_resolve_doctree(self.config.master_doc, self, prune_toctrees=False) def write_toc(node, parent): if isinstance(node, addnodes.compact_paragraph) or \ isinstance(node, nodes.bullet_list): for subnode in node: write_toc(subnode, parent) elif isinstance(node, nodes.list_item): item = etree.SubElement(parent, 'sub') for subnode in node: write_toc(subnode, item) elif isinstance(node, nodes.reference): parent.attrib['link'] = node['refuri'] parent.attrib['name'] = node.astext() def istoctree(node): return isinstance(node, addnodes.compact_paragraph) and \ 'toctree' in node for node in tocdoc.traverse(istoctree): write_toc(node, chapters) # Index functions = etree.SubElement(root, 'functions') index = self.env.create_index(self) def write_index(title, refs, subitems): if len(refs) == 0: pass elif len(refs) == 1: name = title xml_type = "function" if ' ' in title: func, rest = title.split(' ', 1) if rest.endswith('method)') and func.endswith('()'): complete_class_name = rest.rsplit(' ', 1)[0] canonical_class_name = complete_class_name.rsplit( '.', 1)[1] name = canonical_class_name + '.' + func elif rest.endswith('attribute)'): complete_class_name = rest.rsplit(' ', 1)[0] canonical_class_name = complete_class_name.rsplit( '.', 1)[1] name = canonical_class_name + '.' + func xml_type = "constant" etree.SubElement(functions, 'keyword', type=xml_type, name=name, link=refs[0][1]) else: for i, ref in enumerate(refs): etree.SubElement(functions, 'keyword', type="function", name="[%d] %s" % (i, title), link=ref[1]) if subitems: parent_title = re.sub(r'\s*\(.*\)\s*$', '', title) for subitem in subitems: write_index("%s %s" % (parent_title, subitem[0]), subitem[1], []) for (key, group) in index: for title, (refs, subitems) in group: write_index(title, refs, subitems) # Dump the XML file f = gzip.open(os.path.join(outdir, outname + '.devhelp2.gz'), 'w') try: tree.write(f) finally: f.close()
def save_object(fname, obj):
    cPickle.dump(obj, gzip.open(fname, "wb"))
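# Minimal round-trip check for the two pickle helpers above (the file name is
# a placeholder, not from the source); gzip keeps the pickled file small while
# the call signatures stay the same as plain cPickle.
obj = {"weights": [0.1, 0.2], "epoch": 3}
save_object("model.pkl.gz", obj)
assert load_object("model.pkl.gz") == obj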
def open_vcf_file(self, path):
    """ Gets a file object for an individual's VCF file.

    Args:
        path: path to VCF file (gzipped or text format).

    Returns:
        A file handle for the VCF file.
    """
    if not os.path.exists(path):
        raise OSError("VCF file not found at: " + path)

    extension = os.path.splitext(path)[1]

    if extension == ".gz":
        # python2 gzip opens in text, but same mode in python3 opens as
        # bytes, avoid with platform specific code
        if IS_PYTHON2:
            handle = gzip.open(path, "r")
        elif IS_PYTHON3:
            handle = gzip.open(path, "rt")
    elif extension in [".vcf", ".txt"]:
        handle = io.open(path, "r", encoding="latin_1")
    else:
        raise OSError("unsupported filetype: " + path)

    return handle
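# Illustrative call (the instance name and path are assumptions): the returned
# handle always yields text, so header and record lines can be handled the
# same way whether the VCF is plain or gzipped.
# handle = individual.open_vcf_file("sample.vcf.gz")
# for line in handle:
#     if line.startswith("#"):
#         continue
#     chrom, pos = line.split("\t")[:2]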