def define_in_namespace(self, namespace, computational_graph_definition, safe_on=False):
    if self.namespace is not None:
        raise ValueError("model may only be initialized once.")

    self.namespace = check.check_not_empty(namespace)
    self.save_name = check.check_not_empty(
        re.sub("[^a-zA-Z0-9]", "", self.namespace))

    with tf.variable_scope(self.namespace):
        logging.debug("Defining computational graph.")
        computational_graph_definition()

    self.saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.namespace))
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    safe_str = ""

    if safe_on:
        # Disable arithmetic graph rewrites so ops run exactly as defined.
        config.graph_options.rewrite_options.arithmetic_optimization = \
            rewriter_config_pb2.RewriterConfig.OFF
        safe_str = "safely "

    session = tf.Session(config=config)
    session.run(tf.global_variables_initializer())
    logging.debug(
        "Defined computational graph and %sinitialized session." % safe_str)
    return session
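# A minimal usage sketch, assuming the method above belongs to a model wrapper
# class (called "Model" here purely for illustration) and that TensorFlow 1.x
# is in use. "build_graph" is a hypothetical callback that defines the model's
# variables and ops; define_in_namespace scopes them under the given namespace,
# builds a Saver over that scope, and returns an initialized Session.
#
#     def build_graph():
#         tf.get_variable("weights", shape=[128, 10])
#
#     model = Model()
#     session = model.define_in_namespace("my-model", build_graph, safe_on=True)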
def perplexity(probabilities):
    check.check_not_empty(probabilities)
    total_log_probability = 0.0

    for probability in probabilities:
        if probability < 0.0 or probability > 1.0:
            raise ValueError("Probability must be in [0, 1]: %f." % probability)

        # Substitute the small ZERO_PROBABILITY floor so log2(0) is never taken.
        total_log_probability += math.log2(
            ZERO_PROBABILITY if probability == 0 else probability)

    return math.pow(2.0, -total_log_probability / len(probabilities))
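# Worked example of the formula above (a standalone demo, not part of the module):
# if every expected token was assigned probability 0.25, the mean log2 probability
# is -2, so perplexity is 2**2 = 4, i.e. the model is as uncertain as a uniform
# four-way choice at each step.

def _perplexity_demo():
    import math as _math
    probabilities = [0.25, 0.25, 0.25, 0.25]
    total_log_probability = sum(_math.log2(p) for p in probabilities)
    return _math.pow(2.0, -total_log_probability / len(probabilities))

assert _perplexity_demo() == 4.0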
def __init__(self, sequence, predicted, expected):
    self.sequence = check.check_not_empty(sequence)
    self.predicted = check.check_instance(predicted, SequenceStats)
    self.expected = check.check_instance(expected, SequenceStats)
    check.check_length(self.predicted.values, len(self.sequence))
    check.check_length(self.expected.values, len(self.sequence))
    self.perplexity = perplexity(self.expected.probabilities)
def __init__(self, name, layer, width):
    self.name = check.check_not_empty(name)
    self.layer = layer

    if layer is not None:
        check.check_gte(layer, 0)

    self.width = check.check_gte(width, 1)
def __init__(self, name, default, plural=False, canonicalize=lambda v: v):
    self.name = check.check_not_empty(name)
    self.default = default
    self.plural = check.check_one_of(plural, [True, False])
    self.canonicalize = canonicalize
    self.key = self.name if not self.plural else "%s[]" % self.name
def __init__(self, words):
    super(Term, self).__init__()
    self.words = tuple(check.check_not_empty(words))
    self._hash = None
def __init__(self, word):
    self.word = check.check_not_empty(check.check_not_none(word))
    self.literal = canonicalize_word(word)
def parse(self, input_stream):
    pages = []
    parse_terms = set()

    for line in input_stream:
        for item in line.split("."):
            page_id = item.strip()

            if page_id != "":
                if os.path.exists(self._page_file_contents(page_id)):
                    # Cached page: read the contents and links from disk.
                    with open(self._page_file_contents(page_id), "r", encoding="utf-8") as fh:
                        page_content = fh.read()

                    with open(self._page_file_links(page_id), "r", encoding="utf-8") as fh:
                        page_links = [l.strip() for l in fh.readlines()]
                else:
                    # Uncached page: fetch it from Wikipedia.  A page_id of the form
                    # "Title#Section1#Section2" restricts parsing to those sections.
                    split = page_id.split("#")
                    page = wikipedia.page(split[0])

                    try:
                        if not page.exists():
                            raise errors.Invalid("Missing wikipedia page '%s'." % split[0])
                    except requests.exceptions.ReadTimeout:
                        raise errors.Invalid("Missing wikipedia page '%s'." % split[0])

                    if len(split) == 1:
                        page_content = check.check_not_empty(CLEANER(page.summary))
                    else:
                        page_content = ""

                    for section in (page.section_titles if len(split) == 1 else split[1:]):
                        if section not in self.SECTION_BLACKLIST:
                            logging.debug("Page '%s' using section '%s'." % (page_id, section))
                            raw_section_content = page.section_by_title(section).text

                            if raw_section_content is not None and len(raw_section_content) > 0:
                                section_content = CLEANER(raw_section_content)

                                if len(section_content) > 0:
                                    page_content += " " + section_content

                    page_links = [CLEANER(l) for l in page.links]

                pages += [page_id]

                if not os.path.exists(self._page_file_contents(page_id)):
                    # Cache the freshly fetched page for subsequent runs.
                    with open(self._page_file_contents(page_id), "w", encoding="utf-8") as fh:
                        fh.write(page_content.replace("\n", "\n\n"))

                    with open(self._page_file_links(page_id), "w", encoding="utf-8") as fh:
                        for link in page_links:
                            fh.write("%s\n" % link)

                page_terms = set()

                for page_term in self._extract_links(page_id, page_links, page_content):
                    page_terms.add(page_term)

                for term in page_terms:
                    self.terms.add(term)

                    if term not in parse_terms:
                        logging.debug("Page '%s' adding term '%s'." % (page_id, term))
                        parse_terms.add(term)
                        self.inflections.record(term, term)

    terms_trie = build_trie(parse_terms)

    for page_id in pages:
        with open(self._page_file_contents(page_id), "r", encoding="utf-8") as fh:
            page_content = fh.read()

        sentences = nlp.split_sentences(page_content, self.paragraphs)
        maximum_offset = math.ceil(float(len(sentences)) / self.window)

        for offset in range(0, maximum_offset):
            # Flatten the window of sentences (a list of word lists) into one word list.
            sub_corpus = [
                word
                for sentence in sentences[offset:offset + self.window]
                for word in sentence
            ]
            reference_terms = nlp.extract_terms(corpus=sub_corpus,
                                                terms_trie=terms_trie,
                                                lemmatizer=CANONICALIZER,
                                                inflection_recorder=self.inflections.record)
            logging.debug("Page '%s' reference terms: %s" % (page_id, reference_terms))

            for a in reference_terms:
                for b in reference_terms:
                    if a != b:
                        if a not in self.cooccurrences:
                            self.cooccurrences[a] = {}

                        if b not in self.cooccurrences[a]:
                            self.cooccurrences[a][b] = []

                        self.cooccurrences[a][b] += [sub_corpus]
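# A standalone sketch of the windowed co-occurrence collection performed at the end
# of parse(), stripped of the Wikipedia fetching, caching, and trie-based term
# extraction (the membership test below is a simplification of nlp.extract_terms,
# and all names here are illustrative, not part of the module).  Every ordered pair
# of distinct terms appearing in the same window of sentences is recorded together
# with that window's flattened word list, mirroring the cooccurrences structure above.

import math


def collect_cooccurrences(sentences, terms, window):
    cooccurrences = {}
    maximum_offset = math.ceil(float(len(sentences)) / window)

    for offset in range(0, maximum_offset):
        # Flatten the window of sentences into a single word list.
        sub_corpus = [word for sentence in sentences[offset:offset + window] for word in sentence]
        found = [term for term in terms if term in sub_corpus]

        for a in found:
            for b in found:
                if a != b:
                    cooccurrences.setdefault(a, {}).setdefault(b, []).append(sub_corpus)

    return cooccurrences


# Example: "cat" and "dog" co-occur in the first two-sentence window only.
#     collect_cooccurrences(
#         [["the", "cat", "sat"], ["the", "dog", "ran"], ["birds", "fly"]],
#         {"cat", "dog"},
#         window=2)
#     => {"cat": {"dog": [[...]]}, "dog": {"cat": [[...]]}}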