Example #1
    def gdb_on_inferior_deleted(self, event):
        util.verbose("gdb_on_inferior_deleted()")

        def _mt():
            self.gdb_on_inferior_deleted__mT(event)
        
        gdb.post_event(_mt)
Example #2
 def get_pages( self, lang, search_term, nb_results ):
     """
         Return a list of pages for `search_term` in language `lang`,
         stopping after `nb_results` usable pages or when the result
         set is exhausted.
     """
     pages = []
     r_count = 0 # Counts only pages that go to the final results
     p_count = 0 # Counts all pages processed, including ignored ones
     result_xml = self.send_query( lang, search_term )
     result_dom = xml.dom.minidom.parseString( result_xml )
     res_element = result_dom.getElementsByTagName( "RES" )[ 0 ]
     ending = int(res_element.getAttribute("EN"))
     total = int( self.get_field( res_element, "M" ) )
     verbose( "The query "+search_term+" returned "+str(total)+" results.")
     #pdb.set_trace()
     while r_count < nb_results and p_count < ending :
         try :
             for r in res_element.getElementsByTagName( 'R' ) :
                 if r_count < nb_results and p_count < ending :
                     page = self.build_page( r, r_count, p_count, \
                                             search_term, lang, total )
                     if page is not None :
                         pages.append( page )
                         verbose( "  Downloaded page " + str(r_count) )
                         r_count = r_count + 1
                     p_count = p_count + 1
                 else :
                     break
         except Exception, e :
             print >> sys.stderr, "Something went terribly wrong"
             pdb.set_trace()
             print e
         if r_count < nb_results and ending % 20 == 0:
             result_xml = self.send_query( lang, search_term, p_count )
             result_dom = xml.dom.minidom.parseString( result_xml )
             res_element = result_dom.getElementsByTagName( "RES" )[ 0 ]
             ending = int(res_element.getAttribute("EN"))
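Stripped of the Google-specific XML handling, the control flow above amounts to: keep consuming result batches, skip pages that build_page rejects, and stop once nb_results usable pages have been collected or the result set is exhausted. A self-contained toy version of just that loop (no querying or XML parsing, all names below are illustrative only):

# Toy sketch of the paging/filtering loop only.
def collect_pages(batches, nb_results):
    pages, p_count = [], 0
    for batch in batches:
        for item in batch:
            p_count += 1
            if item is not None:        # build_page() returns None for rejected pages
                pages.append(item)
            if len(pages) >= nb_results:
                return pages
    return pages

print(collect_pages([["p1", None, "p2"], ["p3"]], 3))   # ['p1', 'p2', 'p3']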
Example #3
	def build_suffix_arrays(self):
		"""
			Build suffix arrays for all attributes in the index.
		"""
		for attr in self.arrays.keys():
			verbose("Building suffix array for %s..." % attr)
			self.arrays[attr].build_suffix_array()
Example #4
	def load(self, attribute):
		"""
			Load an attribute from the corresponding index files.
			If the attribute is of the form `a1+a2` and the corresponding
			file does not exist, create a new suffix array by fusing the
			arrays for attributes `a1` and `a2`.
		"""

		if self.arrays.has_key(attribute):
			return self.arrays[attribute]

		verbose("Loading corpus files for attribute \"%s\"." % attribute)
		array = SuffixArray()
		path = self.basepath + "." + attribute
		array.set_basepath(path)
		try:
			array.load()
		except IOError, err:
			# If attribute is composed, fuse the corresponding suffix arrays.
			if '+' in attribute:
				attr1, attr2 = attribute.rsplit('+', 1)

				verbose("Fusing suffix arrays for %s and %s..." % (attr1, attr2))
				array = fuse_suffix_arrays(self.load(attr1), self.load(attr2))

				array.set_basepath(path)
				array.build_suffix_array()
				array.save()

			else:
				raise err
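The fallback above can also be read on its own: if a composed attribute such as `lemma+pos` has no index file, it is built by fusing the two base arrays and cached. A self-contained toy sketch of that shape (plain lists instead of SuffixArray, hypothetical fusion by string concatenation):

# Toy sketch only; the real fuse_suffix_arrays() works on SuffixArray objects.
def load(arrays, attribute):
    if attribute in arrays:
        return arrays[attribute]
    if '+' in attribute:
        attr1, attr2 = attribute.rsplit('+', 1)
        fused = [a + "/" + b for a, b in zip(load(arrays, attr1), load(arrays, attr2))]
        arrays[attribute] = fused       # cache the fused array, like array.save()
        return fused
    raise IOError("no index file for attribute %r" % attribute)

arrays = {"lemma": ["dog", "bark"], "pos": ["N", "V"]}
print(load(arrays, "lemma+pos"))        # ['dog/N', 'bark/V']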
Example #5
	def build_suffix_arrays(self):
		"""
			Build suffix arrays for all attributes in the index.
		"""
		for attr in self.arrays.keys():
			verbose("Building suffix array for %s..." % attr)
			self.arrays[attr].build_suffix_array()
Example #6
	def load(self, attribute):
		"""
			Load an attribute from the corresponding index files.
			If the attribute is of the form `a1+a2` and the corresponding
			file does not exist, create a new suffix array by fusing the
			arrays for attributes `a1` and `a2`.
		"""

		if self.arrays.has_key(attribute):
			return self.arrays[attribute]

		verbose("Loading corpus files for attribute \"%s\"." % attribute)
		array = SuffixArray()
		path = self.basepath + "." + attribute
		array.set_basepath(path)
		try:
			array.load()
		except IOError, err:
			# If attribute is composed, fuse the corresponding suffix arrays.
			if '+' in attribute:
				attr1, attr2 = attribute.rsplit('+', 1)

				verbose("Fusing suffix arrays for %s and %s..." % (attr1, attr2))
				array = fuse_suffix_arrays(self.load(attr1), self.load(attr2))

				array.set_basepath(path)
				array.build_suffix_array()
				array.save()

			else:
				raise err
Example #7
    def _wsRead(self):
        while self.ws_connected:
            try:
                header0_16 = struct.unpack("!BB", self.connection.recv(2, socket.MSG_WAITALL))
                opcode = header0_16[0] & 0b00001111      # low 4 bits: opcode

                is_masked = header0_16[1] & 0b10000000   # high bit: MASK flag
                plen = header0_16[1] & 0b01111111        # low 7 bits: payload length

                if plen == 126:
                    plen = int.from_bytes(struct.unpack("!BB", self.connection.recv(2, socket.MSG_WAITALL)), "big")
                elif plen == 127:
                    plen = int.from_bytes(struct.unpack("!BBBBBBBB", self.connection.recv(8, socket.MSG_WAITALL)), "big")
                
                if is_masked:
                    mkey = struct.unpack("!BBBB", self.connection.recv(4, socket.MSG_WAITALL))
                
                self.message = list(self.connection.recv(plen, socket.MSG_WAITALL))

                if is_masked:
                    for i in range(plen):
                        self.message[i] = chr(self.message[i] ^ mkey[i % 4])
                
                self.message = "".join(self.message)

                self.handleMessage()
            except Exception as e:
                util.verbose("Websocket read error:", e)

                self.ws_connected = False
                self.handleClose()
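The per-byte XOR in the loop above is the standard client-to-server unmasking step. A standalone sketch, independent of the class above and written against RFC 6455 section 5.3:

def unmask(payload, mask_key):
    """Unmask a client frame payload with its 4-byte masking key (RFC 6455, 5.3)."""
    return bytes(b ^ mask_key[i % 4] for i, b in enumerate(payload))

# Masking is an involution: applying the same key twice restores the payload.
masked = unmask(b"hello", b"\x01\x02\x03\x04")
assert unmask(masked, b"\x01\x02\x03\x04") == b"hello"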
Example #8
def _pull_image(image_reference: str, outfileobj=None):
    import util
    util.not_none(image_reference)

    transport = _mk_transport()

    image_reference = normalise_image_reference(image_reference)
    image_reference = _parse_image_reference(image_reference)
    creds = _mk_credentials(image_reference=image_reference)

    # OCI Image Manifest is compatible with Docker Image Manifest Version 2,
    # Schema 2. We indicate support for both formats by passing both media types
    # as 'Accept' headers.
    #
    # For reference:
    #   OCI: https://github.com/opencontainers/image-spec
    #   Docker: https://docs.docker.com/registry/spec/manifest-v2-2/
    accept = docker_http.SUPPORTED_MANIFEST_MIMES

    try:
        # XXX TODO: use streaming rather than writing to local FS
        # if outfileobj is given, we must use it instead of an anonymous temporary file
        outfileobj = outfileobj if outfileobj else tempfile.TemporaryFile()
        with tarfile.open(fileobj=outfileobj, mode='w:') as tar:
            util.verbose(f'Pulling manifest list from {image_reference}..')
            with image_list.FromRegistry(image_reference, creds,
                                         transport) as img_list:
                if img_list.exists():
                    platform = image_list.Platform({
                        'architecture': _PROCESSOR_ARCHITECTURE,
                        'os': _OPERATING_SYSTEM,
                    })
                    # pytype: disable=wrong-arg-types
                    with img_list.resolve(platform) as default_child:
                        save.tarball(_make_tag_if_digest(image_reference),
                                     default_child, tar)
                        return outfileobj
                    # pytype: enable=wrong-arg-types

            util.info(f'Pulling v2.2 image from {image_reference}..')
            with v2_2_image.FromRegistry(image_reference, creds, transport,
                                         accept) as v2_2_img:
                if v2_2_img.exists():
                    save.tarball(_make_tag_if_digest(image_reference),
                                 v2_2_img, tar)
                    return outfileobj

            util.info(f'Pulling v2 image from {image_reference}..')
            with v2_image.FromRegistry(image_reference, creds,
                                       transport) as v2_img:
                with v2_compat.V22FromV2(v2_img) as v2_2_img:
                    save.tarball(_make_tag_if_digest(image_reference),
                                 v2_2_img, tar)
                    return outfileobj
    except Exception as e:
        outfileobj.close()
        util.fail(f'Error pulling and saving image {image_reference}: {e}')
Example #9
    def gdb_on_new_inferior(self, event):
        util.verbose("gdb_on_new_inferior()")

        api.globalvars.inferior_run_times[event.inferior.num] = int(time.time())

        def _mt():
            self.gdb_on_new_inferior__mT(event)
        
        gdb.post_event(_mt)
Example #10
    def gdb_on_exited(self, event):
        util.verbose("gdb_on_exited()")

        api.globalvars.step_time = False

        def _mt():
            self.gdb_on_exited__mT(event)
        
        gdb.post_event(_mt)
Example #11
    def gdb_on_new_objfile(self, event):
        util.verbose("gdb_on_new_objfile()")

        api.globalvars.inferior_run_times[gdb.selected_inferior().num] = int(time.time())

        def _mt():
            self.gdb_on_new_objfile__mT(event)
        
        gdb.post_event(_mt)
Example #12
    def gdb_on_new_thread(self, event):
        util.verbose("gdb_on_new_thread()")

        if event.inferior_thread.inferior.num == 1:
            api.globalvars.inferior_run_times[event.inferior_thread.inferior.num] = int(time.time())
            
        def _mt():
            self.gdb_on_new_thread__mT(event)

        gdb.post_event(_mt)
Example #13
def run_mwu():

    stocks, dates = load_data()
    mwu = MWU(len(stocks))
    last_close = [None] * len(stocks)

    start_money = 0.0
    first_date = dates[0]
    for i, tup in enumerate(stocks.items()):
        stock, date_price = tup
        price = date_price[first_date]
        cls, vol, op, hi, lo = price[KEY_CLOSE], price[KEY_VOLUME], price[
            KEY_OPEN], price[KEY_HIGH], price[KEY_LOW]
        start_money += math.fabs(cls)

    money = start_money
    iteration = 0
    for date in dates:
        losses = []
        for i, tup in enumerate(stocks.items()):
            stock, date_price = tup
            price = date_price[date]
            cls, vol, op, hi, lo = price[KEY_CLOSE], price[KEY_VOLUME], price[
                KEY_OPEN], price[KEY_HIGH], price[KEY_LOW]

            if last_close[i] is not None:
                loss = last_close[i] - cls
                if not math.isfinite(loss):
                    error("invalid loss detected")
                losses.append(loss)

            last_close[i] = cls

        iter_loss = 0
        weights = mwu.get_weights()
        for i, loss in enumerate(losses):
            # We're going to say that every day, we sell everything then buy everything according to weights
            iter_loss += loss * weights[i] / sum(weights) * money / start_money
        money -= iter_loss

        mwu.run_iteration(losses, max_loss=money)
        iteration += 1
        if iteration % 100 == 0:
            verbose("{} iterations done".format(iteration), end="\r")

    verbose("\n{} iterations done".format(iteration))
    info("Money: {} vs {} at start, gain = {}".format(
        money, start_money, (money - start_money) / start_money))
    info("MWU reported loss = {}".format(mwu.get_loss()))
    weights = mwu.get_weights()
    losses = mwu.get_losses()
    info("Stats:")
    for i, stock in enumerate(stocks.keys()):
        info("{}: weight {}, loss {}".format(stock.upper(), weights[i],
                                             losses[i]))
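The MWU class itself is not shown in this listing. Purely as an assumption about its interface (a constructor taking the number of experts, plus get_weights, run_iteration(losses, max_loss=...), get_loss and get_losses, as used above), a minimal multiplicative-weights sketch could look like this; the learning rate and the normalisation by max_loss are assumptions, not the project's implementation:

# Hedged sketch only; not the project's MWU class.
class MWUSketch:
    def __init__(self, n, eta=0.5):
        self.weights = [1.0] * n          # one weight per expert (stock)
        self.losses = [0.0] * n           # cumulative loss per expert
        self.total_loss = 0.0
        self.eta = eta

    def get_weights(self):
        return list(self.weights)

    def get_losses(self):
        return list(self.losses)

    def get_loss(self):
        return self.total_loss

    def run_iteration(self, losses, max_loss):
        if not losses or max_loss == 0:
            return
        total = sum(self.weights)
        for i, loss in enumerate(losses):
            # Algorithm's expected loss under the current weight distribution.
            self.total_loss += loss * self.weights[i] / total
            self.losses[i] += loss
            # Multiplicative update: experts with larger normalised losses shrink.
            self.weights[i] *= 1.0 - self.eta * loss / max_loss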
Example #14
def open_index(prefix):
    """
        Open the index files (valid index created by the `index3.py` script). 
                
        @param prefix The string prefix of the index files.
    """
    global vocab_file, ngrams_file, corpus_file, freq_name, the_corpus_size
    try:
        verbose("Loading index files... this may take some time.")
        verbose("Loading .vocab file")
        vocab_fd = shelve.open(prefix + ".vocab")
        vocab_file.update(vocab_fd)
        vocab_fd.close()
        verbose("Loading .corpus file")
        load_array_from_file(corpus_file, prefix + ".corpus")
        verbose("Loading .ngrams file")
        load_array_from_file(ngrams_file, prefix + ".ngrams")
        freq_name = re.sub(".*/", "", prefix)
        #pdb.set_trace()
        the_corpus_size = vocab_file[CORPUS_SIZE_KEY]
    except IOError:
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit(2)
    except KeyError:
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit(2)
Example #15
def open_index( prefix ) :
    """
        Open the index files (valid index created by the `index3.py` script). 
                
        @param prefix The string prefix of the index files.
    """
    global vocab_file, ngrams_file, corpus_file, freq_name, the_corpus_size
    try :      
        verbose( "Loading index files... this may take some time." )
        verbose( "Loading .vocab file" )
        vocab_fd = shelve.open( prefix + ".vocab" )
        vocab_file.update( vocab_fd )
        vocab_fd.close()        
        verbose( "Loading .corpus file" )
        load_array_from_file( corpus_file, prefix + ".corpus" )
        verbose( "Loading .ngrams file" )
        load_array_from_file( ngrams_file, prefix + ".ngrams" )         
        freq_name = re.sub( ".*/", "", prefix )
        #pdb.set_trace()
        the_corpus_size = vocab_file[ CORPUS_SIZE_KEY ]              
    except IOError :        
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit( 2 )
    except KeyError :        
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit( 2 )        
Example #16
    def _scan_repository_for_definitions(
        self,
        repository,
        github_cfg,
        org_name,
    ) -> RawPipelineDefinitionDescriptor:
        for branch_name, cfg_entry in self._determine_repository_branches(
                repository=repository):
            try:
                definitions = repository.file_contents(
                    path='.ci/pipeline_definitions', ref=branch_name)
            except NotFoundError:
                continue  # no pipeline definition for this branch

            repo_hostname = urlparse(github_cfg.http_url()).hostname
            override_definitions = cfg_entry.override_definitions() if cfg_entry else {}

            verbose('from repo: ' + repository.name + ':' + branch_name)
            try:
                definitions = yaml.load(definitions.decoded.decode('utf-8'),
                                        Loader=yaml.SafeLoader)
            except BaseException as e:
                repo_path = f'{org_name}/{repository.name}'
                yield DefinitionDescriptor(
                    pipeline_name='<invalid YAML>',
                    pipeline_definition={},
                    main_repo={
                        'path': repo_path,
                        'branch': branch_name,
                        'hostname': repo_hostname
                    },
                    concourse_target_cfg=self.cfg_set.concourse(),
                    concourse_target_team=self.job_mapping.team_name(),
                    override_definitions=(),
                    exception=e,
                )
                return  # nothing else to yield in case parsing failed

            # handle inheritance
            definitions = merge_dicts(definitions, override_definitions)

            yield from self._wrap_into_descriptors(
                repo_path='/'.join([org_name, repository.name]),
                repo_hostname=repo_hostname,
                branch=branch_name,
                raw_definitions=definitions,
                override_definitions=override_definitions,
            )
Example #17
	def append_sentence(self, sentence):
		"""
			Adds a `Sentence` (presumably extracted from an XML file) to the index.
		"""

		for attr in self.used_word_attributes:
			for word in sentence.word_list:
				value = getattr(word, attr)
				self.arrays[attr].append_word(value)
			self.arrays[attr].append_word('')  # '' (symbol 0)  means end-of-sentence

		self.metadata["corpus_size"] += len(sentence.word_list)
		self.sentence_count += 1
		if self.sentence_count % 100 == 0:
			verbose("Processing sentence %d" % self.sentence_count)
Example #18
	def append_sentence(self, sentence):
		"""
			Adds a `Sentence` (presumably extracted from an XML file) to the index.
		"""

		for attr in self.used_word_attributes:
			for word in sentence.word_list:
				value = getattr(word, attr)
				self.arrays[attr].append_word(value)
			self.arrays[attr].append_word('')  # '' (symbol 0)  means end-of-sentence

		self.metadata["corpus_size"] += len(sentence.word_list)
		self.sentence_count += 1
		if self.sentence_count % 100 == 0:
			verbose("Processing sentence %d" % self.sentence_count)
Example #19
def _wait_for_shoot(namespace,
                    on_event,
                    expected_result,
                    timeout_seconds: int = 120):
    ensure_not_empty(namespace)
    start_time = int(time.time())

    custom_api = ctx.create_custom_api()
    w = watch.Watch()
    # very, very sad: workaround until fixed:
    #    https://github.com/kubernetes-incubator/client-python/issues/124
    # (after about a minute, "some" watches return without an error; this was not
    # observed when watching namespaces).
    # Apart from being ugly, this has the downside that existing events will repeatedly
    # be received each time the watch is re-applied.
    should_exit = False
    result = None
    while not should_exit and (start_time + timeout_seconds) > time.time():
        try:
            for e in w.stream(
                    custom_api.list_namespaced_custom_object,
                    group='garden.sapcloud.io',
                    version='v1beta1',
                    namespace=namespace,
                    plural='shoots',
                    # we need to reduce the request-timeout due to our workaround
                    _request_timeout=(timeout_seconds -
                                      int(time.time() - start_time))):
                should_exit, result = on_event(e)
                if should_exit:
                    w.stop()
                    if result != expected_result:
                        raise RuntimeError(result)
                    return
        except ConnectionResetError as cre:
            # ignore connection errors against k8s api endpoint (these may be temporary)
            info('connection reset error from k8s API endpoint - ignored: ' +
                 str(cre))
        except ProtocolError as err:
            verbose('http connection error - ignored')
        except KeyError as err:
            verbose("key {} not yet available - ignored".format(str(err)))
    # handle case where timeout was exceeded, but w.stream returned erroneously (see bug
    # description above)
    raise RuntimeError(result)
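For context, on_event is expected to be a callable that receives a watch event and returns a (should_exit, result) pair. A hypothetical callback is sketched below; the event fields (object, status, lastOperation, state) are assumptions about the Shoot resource, not taken from this snippet:

# Hypothetical callback sketch; field names are assumptions.
def on_shoot_event(event):
    shoot = event.get('object', {})
    state = shoot.get('status', {}).get('lastOperation', {}).get('state')
    if state in ('Succeeded', 'Failed'):
        return True, state      # stop the watch and report the terminal state
    return False, None          # keep watching

# _wait_for_shoot('garden-myproject', on_shoot_event, expected_result='Succeeded')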
Example #20
	def load(self, attribute):
		if self.arrays.has_key(attribute):
			return self.arrays[attribute]

		verbose("Loading corpus files for attribute \"%s\"." % attribute)
		array = SuffixArray()
		path = self.basepath + "." + attribute
		array.set_basepath(path)
		try:
			array.load()
		except IOError, err:
			# If attribute is composed, fuse the corresponding suffix arrays.
			if '+' in attribute:
				attr1, attr2 = attribute.rsplit('+', 1)
				array = fuse_suffix_arrays(self.load(attr1), self.load(attr2))

				array.set_basepath(path)
				array.build_suffix_array()
				array.save()

			else:
				raise err
Example #21
def open_index( prefix ) :
    """
        Open the index files (valid index created by the `index3.py` script). 
                
        @param prefix The string prefix of the index files.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    try :      
        verbose( "Loading index files... this may take some time." )
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub( ".*/", "", prefix )
        #pdb.set_trace()
        the_corpus_size = index.metadata["corpus_size"]
    except IOError :        
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit( 2 )
    except KeyError :        
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit( 2 )        
Example #22
def treat_entity( entity ) :
    """
        For each entity, searches for the individual word frequencies of the
        base ngram. The corresponding function, for a corpus index or for yahoo,
        will be called, depending on the -i or -y options. The frequencies are
        added as a child element of the word and then the candidate is printed.
        
        @param entity The entity (candidate) that is being read from the XML file.
    """
    global entity_counter
    global low_limit, up_limit
    global count_vars
    if entity_counter % 100 == 0 :
        verbose( "Processing ngram number %(n)d" % { "n":entity_counter } )
    if ( entity_counter >= low_limit or low_limit < 0 ) and \
       ( entity_counter <= up_limit or up_limit < 0 ) :
        if count_vars :
            for var in entity.vars :
                append_counters( var )
        else :
            append_counters( entity )
    print entity.to_xml().encode( 'utf-8' )
    entity_counter += 1
Example #23
def treat_sentence( sentence ) :
    """
        For each sentence in the corpus, generates all the candidates that match
        at least one pattern in the patterns file (-p option) or all the
        ngrams that are in the valid range (-n option). The candidates are
        stored into a temporary file and will later be printed to an XML file.
        The temp file is used to avoid printing a repeated candidate twice and
        to count occurrences of the same candidate.
        
        @param sentence A `Sentence` that is being read from the XML file.    
    """
    global patterns, temp_file, ignore_pos, surface_instead_lemmas, \
           longest_pattern, shortest_pattern, sentence_counter
    if sentence_counter % 100 == 0 :
        verbose( "Processing sentence number %(n)d" % { "n":sentence_counter } )

    words = sentence.word_list

    for pattern in patterns:
        for match in match_pattern(pattern, words):
            match_ngram = Ngram(copy_word_list(match), [])

            if ignore_pos :    
                match_ngram.set_all( pos=WILDCARD )
            internal_key = unicode( match_ngram.to_string() ).encode('utf-8')

            if( surface_instead_lemmas ) :
                match_ngram.set_all( lemma=WILDCARD )
            else :
                match_ngram.set_all( surface=WILDCARD )                    
            key = unicode( match_ngram.to_string() ).encode('utf-8')
            ( surfaces_dict, total_freq ) = temp_file.get( key, ( {}, 0 ) )
            freq_surface = surfaces_dict.get( internal_key, 0 )
            surfaces_dict[ internal_key ] = freq_surface + 1
            temp_file[ key ] = ( surfaces_dict, total_freq + 1 )

    sentence_counter += 1
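The temp_file updates at the end of the loop implement a two-level counter: for each lemma-level key, a per-surface-form count plus a running total. A self-contained sketch of just that bookkeeping, using a plain dict and plain string keys instead of the temporary file:

# Sketch of the counting scheme only; keys here are plain strings for clarity.
counts = {}
def add_occurrence(key, internal_key):
    surfaces_dict, total_freq = counts.get(key, ({}, 0))
    surfaces_dict[internal_key] = surfaces_dict.get(internal_key, 0) + 1
    counts[key] = (surfaces_dict, total_freq + 1)

add_occurrence("kick the bucket", "kicked the bucket")
add_occurrence("kick the bucket", "kicks the bucket")
add_occurrence("kick the bucket", "kicked the bucket")
print(counts["kick the bucket"])   # ({'kicked the bucket': 2, 'kicks the bucket': 1}, 3)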
Example #24
 def build_page( self, r, r_count, p_count, search_term, lang, total ) :
     """
         Based on an xml 'R' element, builds a page object
     """
     page = None
     url = self.get_field( r, "UE" )
     verbose( "***** Result " + str( p_count ) )
     #if p_count == 16 :
     #    pdb.set_trace()
     if not ( url.endswith( ( ".pdf", ".PDF" ) ) or \
              url.endswith( ( ".doc", ".DOC" ) ) ) :
         title = self.get_field( r, "TNB" )
         date = str( datetime.date.today() )                   
         snippet = self.split_sent( self.clean( self.get_field(r, "SNB" ) ) )
         raw_text = self.clean( self.html2txt( self.get_field( r, "U" ) ) )
         text = self.split_sent( raw_text )
         if len(text) > 1 : # html correctly converted to txt
             page = GooglePage( search_term, r_count, p_count, lang,\
                                date, url, title, snippet, text, total )
         else :
             verbose( "  Ignored unreadable html" )
     else :
         verbose( "  Ignored pdf or doc page " + url )                       
     return page
Example #25
def main():
    candidates = {}
    
    if surface_instead_lemmas:
        base_attr = 'surface'
    else:
        base_attr = 'lemma'

    def dump(sentence_id, positions, absolute_positions, key, glue):
        (surfaces_dict, total_freq, _) = candidates.get(key, ({}, 0, -1))
        surface_key = tuple([index.arrays['surface'].corpus[j] for j in absolute_positions])
        surfaces_dict.setdefault(surface_key, []).append(
            str(sentence_id) + ":" + ",".join(map(str, positions)))
        candidates[key] = (surfaces_dict, total_freq + 1, glue)

    index = Index(index_basepath)
    index.load_metadata()
    index.load(base_attr)
    index.load('surface')
    extract(index, base_attr, gluefun, dumpfun=dump, min_ngram=min_ngram,
            max_ngram=max_ngram, corpus_length_limit=corpus_length_limit)

    verbose("Outputting candidates file...")
    print XML_HEADER % { "root": "candidates", "ns": "" }

    meta = Meta([CorpusSize("corpus", index.metadata["corpus_size"])],
                [MetaFeat("glue", "real")], [])
    print meta.to_xml().encode('utf-8')

    id_number = 0

    for key in candidates:
        (surfaces_dict, total_freq, glue) = candidates[key]
        if total_freq >= min_frequency:
            # Make <cand> entry (usually lemma-based)
            cand = Candidate(id_number, [], [], [], [], [])
            for j in key:
                w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                setattr(w, base_attr, index.arrays[base_attr].symbols.number_to_symbol[j])
                cand.append(w)
            freq = Frequency('corpus', total_freq)
            cand.add_frequency(freq)
            cand.add_feat(Feature("glue", glue))


            # Add surface forms.
            for surface_key in surfaces_dict:
                occur_form = Ngram([], [])
                for j in surface_key:
                    w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                    w.surface = index.arrays['surface'].symbols.number_to_symbol[j]
                    occur_form.append(w)
                sources = surfaces_dict[surface_key]
                freq_value = len(sources)
                freq = Frequency('corpus', freq_value)
                occur_form.add_frequency(freq)
                occur_form.add_sources(sources)
                cand.add_occur(occur_form)

            print cand.to_xml().encode('utf-8')
            id_number += 1

    print XML_FOOTER % { "root": "candidates" }
Example #26
    treat_options_simplest( opts, arg, n_arg, usage_string )

################################################################################
# MAIN SCRIPT

longopts = ["yahoo", "google", "index=", "verbose", "ignore-pos", "surface",\
            "from=", "to=", "text", "vars", "lang=" ]
arg = read_options( "ywi:vgsf:t:xal:", longopts, treat_options, -1, usage_string )

try : 
    parser = xml.sax.make_parser()
    handler = GenericXMLHandler( treat_meta=treat_meta,
                                 treat_entity=treat_entity,
                                 gen_xml=True )
    parser.setContentHandler( handler )
    verbose( "Counting ngrams in candidates file" )
    if len( arg ) == 0 :
        if text_input :
            treat_text( sys.stdin )
        else :
            parser.parse( sys.stdin )
            print handler.footer
    else :
        for a in arg :
            input_file = open( a )
            if text_input :
                treat_text( input_file )
            else :
                parser.parse( input_file )
                footer = handler.footer
                handler.gen_xml = False