def _getMLU(self, fileid):
    sents = self._get_words(fileid, speaker='CHI', sent=True, stem=True,
                            relation=False, pos=True, strip_space=True,
                            replace=True)
    results = []
    lastSent = []
    numFillers = 0
    for sent in sents:
        posList = [pos for (word, pos) in sent]
        # if any part of the sentence is unintelligible
        if any(pos == 'unk' for pos in posList):
            continue
        # if the sentence is null
        elif sent == []:
            continue
        # if the sentence is the same as the last sent
        elif sent == lastSent:
            continue
        else:
            results.append([word for (word, pos) in sent])
            # count the number of fillers
            numFillers += posList.count('co')
            numFillers += posList.count(None)
        lastSent = sent
    try:
        thisWordList = flatten(results)
        # count the number of morphemes
        # (e.g., 'read' is 1 morpheme but 'read-PAST' is 2 morphemes)
        numWords = float(len(flatten([word.split('-')
                                      for word in thisWordList]))) - numFillers
        numSents = float(len(results))
        mlu = numWords / numSents
    except ZeroDivisionError:
        mlu = 0
    # return {'mlu': mlu, 'wordNum': numWords, 'sentNum': numSents}
    return mlu
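All of the snippets in this section call a `flatten` helper that is never defined here. Below is a minimal sketch of what the call sites appear to assume; the exact definition in each original codebase may differ.

def flatten(*args):
    # Hypothetical helper, inferred from the call sites in this section:
    # accepts one or more arbitrarily nested lists/tuples and returns a
    # single flat list, treating strings and other non-iterables as atoms.
    result = []
    for arg in args:
        if isinstance(arg, (list, tuple, set, frozenset)):
            for item in arg:
                result.extend(flatten(item))
        else:
            result.append(arg)
    return result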
def get_tag_sequence(self):
    # Start from the best-scoring entry for the final position
    # (x[0] == self.n), then walk the stored back-pointers ("t")
    # to recover the full tag sequence.
    self.tags = flatten([max([(x, y) for x, y in self.Table.items()
                              if x[0] == self.n],
                             key=lambda k: k[-1]["score"])[0][-2:]])
    i = list(range(1, self.n - 1))
    i.reverse()
    i = array(i) + 2  # assumes numpy's array is in scope
    for k in i:
        self.tags.insert(0, self.Table[tuple(flatten(k, self.tags[:2]))]["t"])
def _getMLU(self, fileid, speaker):
    sents = self._get_words(
        fileid,
        speaker=speaker,
        sent=True,
        stem=True,
        relation=False,
        pos=True,
        strip_space=True,
        replace=True,
    )
    results = []
    lastSent = []
    numFillers = 0
    sentDiscount = 0
    for sent in sents:
        posList = [pos for (word, pos) in sent]
        # if any part of the sentence is unintelligible
        if any(pos == "unk" for pos in posList):
            continue
        # if the sentence is null
        elif sent == []:
            continue
        # if the sentence is the same as the last sent
        elif sent == lastSent:
            continue
        else:
            results.append([word for (word, pos) in sent])
            # count the number of fillers
            if len(set(["co", None]).intersection(posList)) > 0:
                numFillers += posList.count("co")
                numFillers += posList.count(None)
                sentDiscount += 1
        lastSent = sent
    try:
        thisWordList = flatten(results)
        # count the number of morphemes
        # (e.g., 'read' is 1 morpheme but 'read-PAST' is 2 morphemes)
        numWords = (
            len(flatten([word.split("-") for word in thisWordList])) - numFillers
        )
        numSents = len(results) - sentDiscount
        mlu = numWords / numSents
    except ZeroDivisionError:
        mlu = 0
    # return {'mlu': mlu, 'wordNum': numWords, 'sentNum': numSents}
    return mlu
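A hand-worked example of the arithmetic above, on made-up data (not from CHILDES):

# Two hypothetical stemmed, POS-tagged child sentences:
#   [('I', 'pro'), ('read-PAST', 'v'), ('it', 'pro')]   # 4 morphemes
#   [('uh', 'co'), ('doggy', 'n')]                      # contains a filler
# Here numFillers = 1 and sentDiscount = 1, so:
#   numWords = 6 morphemes after splitting on '-', minus 1 filler = 5
#   numSents = 2 sentences, minus 1 discounted = 1
#   mlu = 5 / 1 = 5.0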
def package(self):
    n = len(self.sent)
    # flatten each (index, (y2, y1, y)) pair from enumerate into [i, y2, y1, y]
    vals = [flatten(x) for x in enumerate(self.tri())]
    return [{"w": self.sent, "i": i, "y2": y2, "y1": y1, "y": y}
            for i, y2, y1, y in vals]
def add(self, key, value):
    # Store a new key's value (unwrapping singleton lists); for an existing
    # key, merge the value in unless it is already present.
    if value is not None and len(value) > 0:
        if not self.contains(key):
            if len(value) == 1:
                value = value[0]
            self[key] = value
        else:
            if len(value) == 1:
                value = value[0]
            if value not in self[key]:
                self[key] = flatten(self[key], value)
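Assuming the enclosing class is a dict subclass with a `contains` method (both assumptions read off the body), the behavior is roughly:

# d.add('pos', ['NN'])   # singleton stored unwrapped: d['pos'] == 'NN'
# d.add('pos', ['VB'])   # merged via flatten: d['pos'] == ['NN', 'VB']
# d.add('pos', ['VB'])   # already present, so ignored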
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.spatial.distance import cdist


def get_paragraph_distances(get_embeddings, title, paragraphs):
    """Calculates the distance of a question title to each of a number of
    paragraphs using sentence embeddings.

    Uses the min distance over all sentences in a paragraph, so callers can
    sort paragraphs by relevance.
    """
    title_tok_str = ' '.join(word_tokenize(title))
    title_embeddings = get_embeddings([title_tok_str.lower()])
    paragraph_sents_lowered = [
        [' '.join(word_tokenize(s)).lower() for s in sent_tokenize(p)]
        for p in paragraphs
    ]
    paragraphs_embeddings = get_embeddings(flatten(paragraph_sents_lowered))
    distances = cdist(paragraphs_embeddings, title_embeddings,
                      'cosine').reshape(-1)
    distances_per_paragraph = []
    sents_processed = 0
    for sents in paragraph_sents_lowered:
        distances_per_paragraph.append(
            min(distances[sents_processed:sents_processed + len(sents)]))
        sents_processed += len(sents)
    return distances_per_paragraph
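A usage sketch; `word_tokenize`, `sent_tokenize`, and `cdist` are NLTK's and SciPy's, but the embedding function below is a purely illustrative stand-in.

import numpy as np

def random_embeddings(sentences):
    # Stand-in encoder: one random 16-dim vector per sentence. A real
    # caller would supply an actual sentence encoder here.
    return np.random.rand(len(sentences), 16)

dists = get_paragraph_distances(
    random_embeddings,
    "How do sentence embeddings work?",
    ["First paragraph. It has two sentences.", "Second paragraph."])
# -> one distance per paragraph, in input order; smaller means closer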
from collections import defaultdict


def frequencies(*seq):
    # Count occurrences of each atom across all (possibly nested) inputs.
    seq = flatten(seq)
    freq = defaultdict(int)
    for i in seq:
        freq[i] += 1
    return freq
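For example, with the flatten sketch above, nested inputs are counted after flattening:

# frequencies([1, 2], [2, [3, 2]]) -> defaultdict with {1: 1, 2: 3, 3: 1}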
def _compare_with_conn(self, current_token, dir_is_left,
                       connective_instances, instance_under_construction):
    if dir_is_left:
        arc_direction = 'LEFT'
        first_uncompared_index = -1
        compared = self.lambda_2
        uncompared = self.lambda_1
    else:
        arc_direction = 'RIGHT'
        first_uncompared_index = 0
        compared = self.lambda_3
        uncompared = self.lambda_4

    conn_instance_index = 0
    conn_instance = connective_instances[conn_instance_index]
    other_connective_tokens = set(
        flatten([i.connective for i in connective_instances[1:]]))
    other_connective_tokens -= set(conn_instance.connective)
    last_modified_arc_type = None

    while uncompared:
        token_to_compare = uncompared[first_uncompared_index]
        # First, see if we should split. But don't split on leftward tokens.
        if (not dir_is_left and token_to_compare in other_connective_tokens
                and self._last_op != 'SPLIT'):
            instance_under_construction = self._do_split(
                current_token, last_modified_arc_type, token_to_compare,
                instance_under_construction)
            # Move to next
            conn_instance_index += 1
            conn_instance = connective_instances[conn_instance_index]
            # Leave current token to be compared with new connective.
        else:
            # If there's a fragment, record it first, before looking at the
            # args. (The fragment word might still be part of an arg.)
            if (token_to_compare is not current_token
                    and self._last_op not in  # no fragments after splits/frags
                    ['SPLIT', "CONN-FRAG-{}".format(arc_direction)]
                    and token_to_compare in conn_instance.connective):
                self._write_transition(
                    current_token, "CONN-FRAG-{}".format(arc_direction))
                instance_under_construction.connective.append(
                    token_to_compare)

            arcs_to_add = []
            for arc_type in ['cause', 'effect', 'means']:
                argument = getattr(conn_instance, arc_type, None)
                if argument is not None and token_to_compare in argument:
                    arcs_to_add.append(arc_type)
                    # TODO: This will do odd things if there's ever a SPLIT
                    # interacting with a multiple-argument arc.
                    last_modified_arc_type = arc_type

            if arcs_to_add:
                trans = "{}-ARC({})".format(
                    arc_direction,
                    ','.join(arc_type.title() for arc_type in arcs_to_add))
                instance_under_construction = self._write_transition(
                    current_token, trans, True, instance_under_construction)
                for arc_type in arcs_to_add:
                    getattr(instance_under_construction,
                            arc_type).append(token_to_compare)
            else:
                instance_under_construction = self._write_transition(
                    current_token, "NO-ARC-{}".format(arc_direction), True,
                    instance_under_construction)

            if dir_is_left:
                compared.appendleft(uncompared.pop())
            else:
                compared.append(uncompared.popleft())

    return instance_under_construction  # make update visible
def __init__(self, docs):
    self.tuples = flatten([Process(x).hist for x in docs])
def add(self, software_name, tweet):
    if self.contains(software_name):
        self[software_name]["tweets"] = flatten(
            self[software_name]["tweets"], tweet)
        self[software_name]["weight"] += 1
    else:
        self[software_name] = {"tweets": tweet, "weight": 1}
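Assumed usage, given a dict subclass with a `contains` method (the instance name here is hypothetical):

# index.add('numpy', ['tweet-1'])  # {'tweets': ['tweet-1'], 'weight': 1}
# index.add('numpy', ['tweet-2'])  # {'tweets': ['tweet-1', 'tweet-2'], 'weight': 2}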
def active_features(h, m, l=None):
    if l:
        active = flatten([f(h, m, l) for f in features.values()])
    else:
        active = flatten([f(h, m) for f in features.values()])
    return [x for x in active if x in model]
def score(h, m, l=None):
    if l:
        active = flatten([f(h, m, l) for f in features.values()])
    else:
        active = flatten([f(h, m) for f in features.values()])
    return sum(model[x] for x in active if x in model)
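Both functions read module-level `features` (name -> feature extractor) and `model` (feature -> weight) mappings that are not shown. A toy sketch of the assumed shapes:

# Assumed module-level state, inferred from the call sites above:
features = {
    'hm': lambda h, m, l=None: ['hm:{}_{}'.format(h, m)],
    'dist': lambda h, m, l=None: ['dist:{}'.format(abs(h - m))],
}
model = {'hm:2_1': 0.5, 'dist:1': -0.25}  # learned feature weights

# active_features(2, 1) -> ['hm:2_1', 'dist:1']
# score(2, 1)           -> 0.25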
texts. The first classifier learns to sort texts into different
categories; the second learns which categories the user finds
interesting."""
# 1500 random text passages from the Reuters corpus
print('Load corpus')
#corp = reuters.raw()
print('Loaded corpus')
#rnd = np.random.randint(0, len(corp) / 2, 1500)
#raw_documents = [corp[i:i + 300] for i in rnd]
print('Created docs')
corp = brown.paras(categories='hobbies')
rnd = np.random.randint(0, len(corp) - 3, 300)
raw_documents = [flatten(corp[i:i + 3]) for i in rnd]
raw_doc2 = list()
for doc in raw_documents:
    raw_doc2.append(''.join(str(word) + " " for word in doc))
raw_documents = raw_doc2
#posts_j = json.load(open('cogsci.json'))
#posts = posts_j.values()
#raw_documents = list()
#for post in posts:
#    if 'message' in post:
#        raw_documents.append(post['message'])
#max_docs = len(raw_documents)