def gdb_on_inferior_deleted(self, event):
    util.verbose("gdb_on_inferior_deleted()")

    def _mt():
        self.gdb_on_inferior_deleted__mT(event)
    gdb.post_event(_mt)
def get_pages(self, lang, search_term, nb_results):
    """
        Queries the search engine for `search_term` in language `lang` and
        returns a list of up to `nb_results` downloaded pages.
    """
    pages = []
    r_count = 0  # Counts only pages that go into the final results
    p_count = 0  # Counts all processed pages, including ignored ones
    result_xml = self.send_query(lang, search_term)
    result_dom = xml.dom.minidom.parseString(result_xml)
    res_element = result_dom.getElementsByTagName("RES")[0]
    ending = int(res_element.getAttribute("EN"))
    total = int(self.get_field(res_element, "M"))
    verbose("The query " + search_term + " returned " + str(total) + " results.")
    while r_count < nb_results and p_count < ending:
        try:
            for r in res_element.getElementsByTagName('R'):
                if r_count < nb_results and p_count < ending:
                    page = self.build_page(r, r_count, p_count,
                                           search_term, lang, total)
                    if page is not None:
                        pages.append(page)
                        verbose(" Downloaded page " + str(r_count))
                        r_count = r_count + 1
                    p_count = p_count + 1
                else:
                    break
        except Exception, e:
            print >> sys.stderr, "Something went terribly wrong"
            print >> sys.stderr, e
        if r_count < nb_results and ending % 20 == 0:
            result_xml = self.send_query(lang, search_term, p_count)
            result_dom = xml.dom.minidom.parseString(result_xml)
            res_element = result_dom.getElementsByTagName("RES")[0]
            ending = int(res_element.getAttribute("EN"))
    return pages
def build_suffix_arrays(self):
    """
        Build suffix arrays for all attributes in the index.
    """
    for attr in self.arrays.keys():
        verbose("Building suffix array for %s..." % attr)
        self.arrays[attr].build_suffix_array()
def load(self, attribute):
    """
        Load an attribute from the corresponding index files.
        If the attribute is of the form `a1+a2` and the corresponding file
        does not exist, creates a new suffix array fusing the arrays for
        attributes `a1` and `a2`.
    """
    if self.arrays.has_key(attribute):
        return self.arrays[attribute]

    verbose("Loading corpus files for attribute \"%s\"." % attribute)
    array = SuffixArray()
    path = self.basepath + "." + attribute
    array.set_basepath(path)
    try:
        array.load()
    except IOError, err:
        # If attribute is composed, fuse the corresponding suffix arrays.
        if '+' in attribute:
            attr1, attr2 = attribute.rsplit('+', 1)
            verbose("Fusing suffix arrays for %s and %s..." % (attr1, attr2))
            array = fuse_suffix_arrays(self.load(attr1), self.load(attr2))
            array.set_basepath(path)
            array.build_suffix_array()
            array.save()
        else:
            raise err

    # Cache the loaded array so later calls (including the recursive fusion
    # above) reuse it.
    self.arrays[attribute] = array
    return array
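# Hedged usage sketch (basepath and attribute names are made up for
# illustration): loading a composed attribute goes through the fusion path in
# load() the first time, and is answered from the in-memory cache afterwards.
index = Index("/tmp/corpus_index")          # hypothetical basepath
lemma_pos = index.load("lemma+pos")         # fuses "lemma" and "pos" if the file is missing
lemma_pos_again = index.load("lemma+pos")   # served from self.arrays cache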
def _wsRead(self):
    while self.ws_connected:
        try:
            # First two header bytes: FIN/RSV/opcode and MASK bit + payload length.
            header0_16 = struct.unpack("!BB", self.connection.recv(2, socket.MSG_WAITALL))
            opcode = header0_16[0] & 0b00001111
            is_masked = header0_16[1] & 0b10000000
            plen = header0_16[1] & 127
            # Extended payload length: 2 more bytes if 126, 8 more bytes if 127.
            if plen == 126:
                plen = struct.unpack("!H", self.connection.recv(2, socket.MSG_WAITALL))[0]
            elif plen == 127:
                plen = struct.unpack("!Q", self.connection.recv(8, socket.MSG_WAITALL))[0]
            if is_masked:
                mkey = struct.unpack("!BBBB", self.connection.recv(4, socket.MSG_WAITALL))
            data = list(self.connection.recv(plen, socket.MSG_WAITALL))
            if is_masked:
                # Unmask: each payload byte is XORed with one byte of the masking key.
                data = [data[i] ^ mkey[i % 4] for i in range(plen)]
            self.message = "".join(chr(b) for b in data)
            self.handleMessage()
        except Exception as e:
            util.verbose("Websocket read error:", e)
            self.ws_connected = False
            self.handleClose()
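# Hedged illustration (not part of the original module): builds a masked
# client-to-server text frame following RFC 6455, i.e. the kind of frame
# _wsRead() above expects to parse. The function name is a made-up example.
import os
import struct

def build_masked_text_frame(text):
    payload = text.encode("utf-8")
    header = bytes([0b10000001])                               # FIN=1, opcode=0x1 (text)
    if len(payload) < 126:
        header += bytes([0b10000000 | len(payload)])           # MASK bit + 7-bit length
    elif len(payload) < 2 ** 16:
        header += bytes([0b10000000 | 126]) + struct.pack("!H", len(payload))
    else:
        header += bytes([0b10000000 | 127]) + struct.pack("!Q", len(payload))
    mkey = os.urandom(4)                                       # 4-byte masking key
    masked = bytes(b ^ mkey[i % 4] for i, b in enumerate(payload))
    return header + mkey + masked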
def _pull_image(image_reference: str, outfileobj=None):
    import util
    util.not_none(image_reference)

    transport = _mk_transport()

    image_reference = normalise_image_reference(image_reference)
    image_reference = _parse_image_reference(image_reference)
    creds = _mk_credentials(image_reference=image_reference)

    # OCI Image Manifest is compatible with Docker Image Manifest Version 2,
    # Schema 2. We indicate support for both formats by passing both media types
    # as 'Accept' headers.
    #
    # For reference:
    #   OCI: https://github.com/opencontainers/image-spec
    #   Docker: https://docs.docker.com/registry/spec/manifest-v2-2/
    accept = docker_http.SUPPORTED_MANIFEST_MIMES

    try:
        # XXX TODO: use streaming rather than writing to local FS
        # if outfileobj is given, we must use it instead of creating an
        # anonymous temporary file
        outfileobj = outfileobj if outfileobj else tempfile.TemporaryFile()
        with tarfile.open(fileobj=outfileobj, mode='w:') as tar:
            util.verbose(f'Pulling manifest list from {image_reference}..')
            with image_list.FromRegistry(image_reference, creds, transport) as img_list:
                if img_list.exists():
                    platform = image_list.Platform({
                        'architecture': _PROCESSOR_ARCHITECTURE,
                        'os': _OPERATING_SYSTEM,
                    })
                    # pytype: disable=wrong-arg-types
                    with img_list.resolve(platform) as default_child:
                        save.tarball(_make_tag_if_digest(image_reference), default_child, tar)
                        return outfileobj
                    # pytype: enable=wrong-arg-types

            util.info(f'Pulling v2.2 image from {image_reference}..')
            with v2_2_image.FromRegistry(image_reference, creds, transport, accept) as v2_2_img:
                if v2_2_img.exists():
                    save.tarball(_make_tag_if_digest(image_reference), v2_2_img, tar)
                    return outfileobj

            util.info(f'Pulling v2 image from {image_reference}..')
            with v2_image.FromRegistry(image_reference, creds, transport) as v2_img:
                with v2_compat.V22FromV2(v2_img) as v2_2_img:
                    save.tarball(_make_tag_if_digest(image_reference), v2_2_img, tar)
                    return outfileobj
    except Exception as e:
        outfileobj.close()
        util.fail(f'Error pulling and saving image {image_reference}: {e}')
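# Hedged usage sketch (the image reference below is a made-up example, and this
# assumes it runs in the same module so tarfile is already imported): the
# returned file object holds the image as a docker-save style tarball and is
# positioned at the end after writing, so rewind before reading.
fobj = _pull_image('registry.example.org/acme/app:1.2.3')
fobj.seek(0)
with tarfile.open(fileobj=fobj, mode='r:') as tar:
    print(tar.getnames())   # manifest plus layer entries
fobj.close()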
def gdb_on_new_inferior(self, event):
    util.verbose("gdb_on_new_inferior()")
    api.globalvars.inferior_run_times[event.inferior.num] = int(time.time())

    def _mt():
        self.gdb_on_new_inferior__mT(event)
    gdb.post_event(_mt)
def gdb_on_exited(self, event):
    util.verbose("gdb_on_exited()")
    api.globalvars.step_time = False

    def _mt():
        self.gdb_on_exited__mT(event)
    gdb.post_event(_mt)
def gdb_on_new_objfile(self, event):
    util.verbose("gdb_on_new_objfile()")
    api.globalvars.inferior_run_times[gdb.selected_inferior().num] = int(time.time())

    def _mt():
        self.gdb_on_new_objfile__mT(event)
    gdb.post_event(_mt)
def gdb_on_new_thread(self, event):
    util.verbose("gdb_on_new_thread()")
    if event.inferior_thread.inferior.num == 1:
        api.globalvars.inferior_run_times[event.inferior_thread.inferior.num] = int(time.time())

    def _mt():
        self.gdb_on_new_thread__mT(event)
    gdb.post_event(_mt)
def run_mwu():
    stocks, dates = load_data()
    mwu = MWU(len(stocks))
    last_close = [None] * len(stocks)

    start_money = 0.0
    first_date = dates[0]
    for i, tup in enumerate(stocks.items()):
        stock, date_price = tup
        price = date_price[first_date]
        cls, vol, op, hi, lo = (price[KEY_CLOSE], price[KEY_VOLUME],
                                price[KEY_OPEN], price[KEY_HIGH], price[KEY_LOW])
        start_money += math.fabs(cls)
    money = start_money

    iteration = 0
    for date in dates:
        losses = []
        for i, tup in enumerate(stocks.items()):
            stock, date_price = tup
            price = date_price[date]
            cls, vol, op, hi, lo = (price[KEY_CLOSE], price[KEY_VOLUME],
                                    price[KEY_OPEN], price[KEY_HIGH], price[KEY_LOW])
            if last_close[i] is not None:
                loss = last_close[i] - cls
                if not math.isfinite(loss):
                    error("invalid loss detected")
                losses.append(loss)
            last_close[i] = cls

        iter_loss = 0
        weights = mwu.get_weights()
        for i, loss in enumerate(losses):
            # We're going to say that every day, we sell everything then buy
            # everything back according to the weights.
            iter_loss += loss * weights[i] / sum(weights) * money / start_money
        money -= iter_loss

        mwu.run_iteration(losses, max_loss=money)

        iteration += 1
        if iteration % 100 == 0:
            verbose("{} iterations done".format(iteration), end="\r")

    verbose("\n{} iterations done".format(iteration))
    info("Money: {} vs {} at start, gain = {}".format(
        money, start_money, (money - start_money) / start_money))
    info("MWU reported loss = {}".format(mwu.get_loss()))

    weights = mwu.get_weights()
    losses = mwu.get_losses()
    info("Stats:")
    for i, stock in enumerate(stocks.keys()):
        info("{}: weight {}, loss {}".format(stock.upper(), weights[i], losses[i]))
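# Hedged sketch (an assumption, not the project's actual MWU class): a minimal
# multiplicative-weights learner exposing the interface run_mwu() relies on,
# i.e. get_weights(), run_iteration(losses, max_loss), get_loss(), get_losses().
class MWUSketch:
    def __init__(self, n, eta=0.5):
        self.weights = [1.0] * n
        self.eta = eta
        self.total_loss = 0.0
        self.cumulative_losses = [0.0] * n

    def get_weights(self):
        return list(self.weights)

    def get_losses(self):
        return list(self.cumulative_losses)

    def get_loss(self):
        return self.total_loss

    def run_iteration(self, losses, max_loss):
        if not losses or max_loss == 0:
            return
        norm = sum(self.weights)
        for i, loss in enumerate(losses):
            # Expected loss of the weighted mixture, then the multiplicative
            # update with losses scaled by max_loss.
            self.total_loss += (self.weights[i] / norm) * loss
            self.cumulative_losses[i] += loss
            self.weights[i] *= (1.0 - self.eta * loss / max_loss)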
def open_index(prefix):
    """
        Open the index files (valid index created by the `index3.py` script).

        @param prefix The string prefix (base path) of the index files.
    """
    global vocab_file, ngrams_file, corpus_file, freq_name, the_corpus_size
    try:
        verbose("Loading index files... this may take some time.")
        verbose("Loading .vocab file")
        vocab_fd = shelve.open(prefix + ".vocab")
        vocab_file.update(vocab_fd)
        vocab_fd.close()
        verbose("Loading .corpus file")
        load_array_from_file(corpus_file, prefix + ".corpus")
        verbose("Loading .ngrams file")
        load_array_from_file(ngrams_file, prefix + ".ngrams")
        freq_name = re.sub(".*/", "", prefix)
        the_corpus_size = vocab_file[CORPUS_SIZE_KEY]
    except (IOError, KeyError):
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit(2)
def _scan_repository_for_definitions(
    self,
    repository,
    github_cfg,
    org_name,
) -> RawPipelineDefinitionDescriptor:
    for branch_name, cfg_entry in self._determine_repository_branches(repository=repository):
        try:
            definitions = repository.file_contents(
                path='.ci/pipeline_definitions',
                ref=branch_name,
            )
        except NotFoundError:
            continue  # no pipeline definition for this branch

        repo_hostname = urlparse(github_cfg.http_url()).hostname
        override_definitions = cfg_entry.override_definitions() if cfg_entry else {}

        verbose('from repo: ' + repository.name + ':' + branch_name)
        try:
            definitions = yaml.load(
                definitions.decoded.decode('utf-8'),
                Loader=yaml.SafeLoader,
            )
        except BaseException as e:
            repo_path = f'{org_name}/{repository.name}'
            yield DefinitionDescriptor(
                pipeline_name='<invalid YAML>',
                pipeline_definition={},
                main_repo={'path': repo_path, 'branch': branch_name, 'hostname': repo_hostname},
                concourse_target_cfg=self.cfg_set.concourse(),
                concourse_target_team=self.job_mapping.team_name(),
                override_definitions=(),
                exception=e,
            )
            return  # nothing else to yield in case parsing failed

        # handle inheritance
        definitions = merge_dicts(definitions, override_definitions)

        yield from self._wrap_into_descriptors(
            repo_path='/'.join([org_name, repository.name]),
            repo_hostname=repo_hostname,
            branch=branch_name,
            raw_definitions=definitions,
            override_definitions=override_definitions,
        )
def append_sentence(self, sentence):
    """
        Adds a `Sentence` (presumably extracted from a XML file) to the index.
    """
    for attr in self.used_word_attributes:
        for word in sentence.word_list:
            value = getattr(word, attr)
            self.arrays[attr].append_word(value)
        self.arrays[attr].append_word('')  # '' (symbol 0) means end-of-sentence

    self.metadata["corpus_size"] += len(sentence.word_list)
    self.sentence_count += 1
    if self.sentence_count % 100 == 0:
        verbose("Processing sentence %d" % self.sentence_count)
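# Hedged illustration (plain lists standing in for the suffix-array storage,
# with made-up sentences): appending sentences interleaves an empty-string
# end-of-sentence marker (symbol 0) after each sentence's words.
lemma_array = []
for sent in [["the", "dog", "barks"], ["a", "cat"]]:
    lemma_array.extend(sent)
    lemma_array.append('')   # end-of-sentence marker
# lemma_array == ['the', 'dog', 'barks', '', 'a', 'cat', '']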
def _wait_for_shoot(namespace, on_event, expected_result, timeout_seconds: int = 120):
    ensure_not_empty(namespace)
    start_time = int(time.time())

    custom_api = ctx.create_custom_api()
    w = watch.Watch()
    # very, very sad: workaround until fixed:
    #   https://github.com/kubernetes-incubator/client-python/issues/124
    # (after about a minute, "some" watches (e.g. not observed when watching
    # namespaces) return without an error).
    # Apart from being ugly, this has the downside that existing events will
    # repeatedly be received each time the watch is re-applied.
    should_exit = False
    result = None
    while not should_exit and (start_time + timeout_seconds) > time.time():
        try:
            for e in w.stream(
                custom_api.list_namespaced_custom_object,
                group='garden.sapcloud.io',
                version='v1beta1',
                namespace=namespace,
                plural='shoots',
                # we need to reduce the request-timeout due to our workaround
                _request_timeout=(timeout_seconds - int(time.time() - start_time)),
            ):
                should_exit, result = on_event(e)
                if should_exit:
                    w.stop()
                    if result != expected_result:
                        raise RuntimeError(result)
                    return
        except ConnectionResetError as cre:
            # ignore connection errors against k8s api endpoint (these may be temporary)
            info('connection reset error from k8s API endpoint - ignored: ' + str(cre))
        except ProtocolError as err:
            verbose('http connection error - ignored')
        except KeyError as err:
            verbose("key {} not yet available - ignored".format(str(err)))

    # handle case where timeout was exceeded, but w.stream returned erroneously
    # (see bug description above)
    raise RuntimeError(result)
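# Hedged usage sketch (callback name, namespace and status field paths are
# assumptions for illustration): an `on_event` callback receives each watch
# event and decides whether to stop waiting and which result to report back.
def _on_shoot_event(event):
    shoot = event['object']
    state = shoot.get('status', {}).get('lastOperation', {}).get('state')
    if state in ('Succeeded', 'Failed', 'Error'):
        return True, state   # stop watching, report the terminal state
    return False, None       # keep watching

# _wait_for_shoot(namespace='garden-dev', on_event=_on_shoot_event, expected_result='Succeeded')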
def load(self, attribute):
    if self.arrays.has_key(attribute):
        return self.arrays[attribute]

    verbose("Loading corpus files for attribute \"%s\"." % attribute)
    array = SuffixArray()
    path = self.basepath + "." + attribute
    array.set_basepath(path)
    try:
        array.load()
    except IOError, err:
        # If attribute is composed, fuse the corresponding suffix arrays.
        if '+' in attribute:
            attr1, attr2 = attribute.rsplit('+', 1)
            array = fuse_suffix_arrays(self.load(attr1), self.load(attr2))
            array.set_basepath(path)
            array.build_suffix_array()
            array.save()
        else:
            raise err

    # Cache the loaded array so later calls (including the recursive fusion
    # above) reuse it.
    self.arrays[attribute] = array
    return array
def open_index(prefix):
    """
        Open the index files (valid index created by the `index3.py` script).

        @param prefix The string prefix (base path) of the index files.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit(2)
def treat_entity(entity):
    """
        For each entity, searches for the individual word frequencies of the
        base ngram. The corresponding function, for a corpus index or for
        yahoo, will be called, depending on the -i or -y options. The
        frequencies are added as a child element of the word and then the
        candidate is printed.

        @param entity The `Candidate` that is being read from the XML file.
    """
    global entity_counter
    global low_limit, up_limit
    global count_vars
    if entity_counter % 100 == 0:
        verbose("Processing ngram number %(n)d" % {"n": entity_counter})
    if (entity_counter >= low_limit or low_limit < 0) and \
       (entity_counter <= up_limit or up_limit < 0):
        if count_vars:
            for var in entity.vars:
                append_counters(var)
        else:
            append_counters(entity)
    print entity.to_xml().encode('utf-8')
    entity_counter += 1
def treat_sentence(sentence):
    """
        For each sentence in the corpus, generates all the candidates that
        match at least one pattern in the patterns file (-p option) or all the
        ngrams that are in the valid range (-n option). The candidates are
        stored into a temporary file and will be further printed to a XML
        file. The temp file is used to avoid printing a repeated candidate
        twice and to count occurrences of the same candidate.

        @param sentence A `Sentence` that is being read from the XML file.
    """
    global patterns, temp_file, ignore_pos, surface_instead_lemmas, \
           longest_pattern, shortest_pattern, sentence_counter
    if sentence_counter % 100 == 0:
        verbose("Processing sentence number %(n)d" % {"n": sentence_counter})
    words = sentence.word_list

    for pattern in patterns:
        for match in match_pattern(pattern, words):
            match_ngram = Ngram(copy_word_list(match), [])
            if ignore_pos:
                match_ngram.set_all(pos=WILDCARD)
            internal_key = unicode(match_ngram.to_string()).encode('utf-8')

            if surface_instead_lemmas:
                match_ngram.set_all(lemma=WILDCARD)
            else:
                match_ngram.set_all(surface=WILDCARD)
            key = unicode(match_ngram.to_string()).encode('utf-8')

            (surfaces_dict, total_freq) = temp_file.get(key, ({}, 0))
            freq_surface = surfaces_dict.get(internal_key, 0)
            surfaces_dict[internal_key] = freq_surface + 1
            temp_file[key] = (surfaces_dict, total_freq + 1)

    sentence_counter += 1
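# Hedged illustration of the counting structure used above, with a plain dict
# standing in for temp_file and made-up lemma/surface keys: each candidate key
# maps to (per-surface counts, total frequency).
temp_file_demo = {}
for lemma_key, surface_key in [("give up", "gave up"),
                               ("give up", "gives up"),
                               ("give up", "gave up")]:
    surfaces_dict, total_freq = temp_file_demo.get(lemma_key, ({}, 0))
    surfaces_dict[surface_key] = surfaces_dict.get(surface_key, 0) + 1
    temp_file_demo[lemma_key] = (surfaces_dict, total_freq + 1)
# temp_file_demo == {"give up": ({"gave up": 2, "gives up": 1}, 3)}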
def build_page(self, r, r_count, p_count, search_term, lang, total):
    """
        Based on an XML 'R' element, builds a page object.
    """
    page = None
    url = self.get_field(r, "UE")
    verbose("***** Result " + str(p_count))
    if not url.endswith((".pdf", ".PDF", ".doc", ".DOC")):
        title = self.get_field(r, "TNB")
        date = str(datetime.date.today())
        snippet = self.split_sent(self.clean(self.get_field(r, "SNB")))
        raw_text = self.clean(self.html2txt(self.get_field(r, "U")))
        text = self.split_sent(raw_text)
        if len(text) > 1:  # html correctly converted to txt
            page = GooglePage(search_term, r_count, p_count, lang,
                              date, url, title, snippet, text, total)
        else:
            verbose(" Ignored unreadable html")
    else:
        verbose(" Ignored pdf or doc page " + url)
    return page
def main():
    candidates = {}
    if surface_instead_lemmas:
        base_attr = 'surface'
    else:
        base_attr = 'lemma'

    def dump(sentence_id, positions, absolute_positions, key, glue):
        (surfaces_dict, total_freq, _) = candidates.get(key, ({}, 0, -1))
        surface_key = tuple([index.arrays['surface'].corpus[j]
                             for j in absolute_positions])
        surfaces_dict.setdefault(surface_key, []).append(
            str(sentence_id) + ":" + ",".join(map(str, positions)))
        candidates[key] = (surfaces_dict, total_freq + 1, glue)

    index = Index(index_basepath)
    index.load_metadata()
    index.load(base_attr)
    index.load('surface')
    extract(index, base_attr, gluefun, dumpfun=dump, min_ngram=min_ngram,
            max_ngram=max_ngram, corpus_length_limit=corpus_length_limit)

    verbose("Outputting candidates file...")
    print XML_HEADER % {"root": "candidates", "ns": ""}

    meta = Meta([CorpusSize("corpus", index.metadata["corpus_size"])],
                [MetaFeat("glue", "real")], [])
    print meta.to_xml().encode('utf-8')

    id_number = 0
    for key in candidates:
        (surfaces_dict, total_freq, glue) = candidates[key]
        if total_freq >= min_frequency:
            # Make <cand> entry (usually lemma-based)
            cand = Candidate(id_number, [], [], [], [], [])
            for j in key:
                w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                setattr(w, base_attr,
                        index.arrays[base_attr].symbols.number_to_symbol[j])
                cand.append(w)
            freq = Frequency('corpus', total_freq)
            cand.add_frequency(freq)
            cand.add_feat(Feature("glue", glue))

            # Add surface forms.
            for surface_key in surfaces_dict:
                occur_form = Ngram([], [])
                for j in surface_key:
                    w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                    w.surface = index.arrays['surface'].symbols.number_to_symbol[j]
                    occur_form.append(w)
                sources = surfaces_dict[surface_key]
                freq_value = len(sources)
                freq = Frequency('corpus', freq_value)
                occur_form.add_frequency(freq)
                occur_form.add_sources(sources)
                cand.add_occur(occur_form)

            print cand.to_xml().encode('utf-8')
            id_number += 1

    print XML_FOOTER % {"root": "candidates"}
    treat_options_simplest(opts, arg, n_arg, usage_string)

################################################################################
# MAIN SCRIPT

longopts = ["yahoo", "google", "index=", "verbose", "ignore-pos", "surface",
            "from=", "to=", "text", "vars", "lang="]
arg = read_options("ywi:vgsf:t:xal:", longopts, treat_options, -1, usage_string)

try:
    parser = xml.sax.make_parser()
    handler = GenericXMLHandler(treat_meta=treat_meta,
                                treat_entity=treat_entity,
                                gen_xml=True)
    parser.setContentHandler(handler)
    verbose("Counting ngrams in candidates file")
    if len(arg) == 0:
        if text_input:
            treat_text(sys.stdin)
        else:
            parser.parse(sys.stdin)
            print handler.footer
    else:
        for a in arg:
            input_file = open(a)
            if text_input:
                treat_text(input_file)
            else:
                parser.parse(input_file)
                footer = handler.footer
                handler.gen_xml = False