def extract_(self, post, topic, features=None):
    """Older variant of extract(): collect category indicator spans from a
    post, update per-topic frequency counters, and return one
    partition-labelled unigram feature vector.

    post     -- object with .id and .text attributes
    topic    -- key into the self.freq / self.by_topic counters
    features -- optional subset of _featlists keys; defaults to all of them

    Returns a defaultdict(int) feature vector, or None when the post is
    discarded (fewer than two counter keys for the topic, or no boundaries).
    """
    # Mutable-default pitfall fixed: the shared [] default is now None;
    # callers passing a list see identical behaviour.
    if not features:
        features = list(_featlists.keys())
    else:
        features = [f for f in features if f in _featlists]

    spans = []
    # span := [ span-id,
    #           [ start, stop,
    #             { "category": (major, minor),
    #               "environment_indicators": [ (word, occurrence_id) ] } ] ]
    for featpair in features:
        (word_pairs, phrase_pairs) = _featlists[featpair]

        # Word indicators are searched sentence-by-sentence.
        occ_i = 0
        for word, rx in word_pairs:
            position = 0
            # BUGFIX: the splitter was r'\.!?' (a literal '.' optionally
            # followed by '!'); the intended sentence-terminator class is
            # [.!?].
            for sentence in re.split(r'[.!?]', post.text):
                # BUGFIX: the original called rx.match(post.text) here,
                # re-matching the start of the whole post once per sentence
                # and emitting duplicated, mis-offset spans.  Search the
                # current sentence and offset by its position instead.
                match = rx.search(sentence)
                if match:
                    spans.append([
                        "%s-%s" % (str(post.id), len(spans) + 1),
                        [position + match.start(), position + match.end(),
                         {"category": featpair,
                          "environment_indicators": [(word, str(occ_i))]}]])
                # +1 accounts for the terminator character consumed by
                # re.split, keeping offsets aligned with post.text.
                position += len(sentence) + 1
                occ_i += 1

        # Phrase indicators are matched against the whole post text.
        occ_i = 0
        for phrase, rx in phrase_pairs:
            for m in rx.finditer(post.text):
                spans.append([
                    "%s-%s" % (str(post.id), len(spans) + 1),
                    [m.start(), m.end(),
                     {"category": featpair,
                      # NOTE: phrase occurrences carry -1 while word
                      # occurrences carry a stringified counter --
                      # preserved as in the original.
                      "environment_indicators": [(m.group(0), -1)]}]])
                occ_i += 1

    # Count each distinct category at most once per post.
    for category in set(d[1][2]['category'] for d in spans):
        self.freq[category] += 1
        self.by_topic[topic][category] += 1

    # Require at least two counter keys for this topic before building a
    # feature vector.
    if len(self.by_topic[topic]) < 2:
        return

    tuples = [(span[1][0], span[1][1], '-'.join(span[1][2]['category']))
              for span in spans]

    b = Boundaries()
    b.initializeFromTuples(tuples)
    if len(b.boundaries) == 0:
        return
    self.by_topic[topic][POSTS_KEY] += 1
    # NOTE(review): walk() receives the whole (start, stop, name) tuple with
    # the largest stop -- presumably the rightmost span; confirm against
    # Boundaries.walk's signature.
    b.walk(1, max(tuples, key=operator.itemgetter(1)))

    feature_vector = defaultdict(int)
    for partition in b.partitions[:-1]:
        # Stemmed, lower-cased unigrams of the partition's text slice.
        unigrams = [self.stemmer.stem(u.lower())
                    for u in re.split(r'\W',
                                      post.text[partition[0]:partition[1]])]
        for unigram in unigrams:
            feature_vector['{}:{}'.format(partition[2], unigram)] += 1
    return feature_vector
def extract(self, post, topic, features=None):
    """Extract category indicator spans from a post, update the per-topic
    and per-side counters, and build normalized unigram feature vectors
    keyed by partition label.

    post     -- object with .id, .text and .topic_side attributes
    topic    -- key into the self.freq / self.by_topic / self.by_side
                counters
    features -- optional subset of _featlists keys; defaults to all of them

    Returns {'all': ..., 'collapsed': ..., 'commitment': ...} -- three
    token-count-normalized vectors, each tagged with the post's topic side
    under self.label -- or None when the post is discarded (fewer than two
    distinct environments, or no boundaries).
    """
    # Mutable-default pitfall fixed: the shared [] default is now None;
    # callers passing a list see identical behaviour.
    if not features:
        features = list(_featlists.keys())
    else:
        features = [f for f in features if f in _featlists]

    spans = []
    # span := [ span-id,
    #           [ start, stop,
    #             { "category": (major, minor),
    #               "environment_indicators": [ (word, occurrence_id) ] } ] ]
    for featpair in features:
        (word_pairs, phrase_pairs) = _featlists[featpair]

        # Word indicators are searched sentence-by-sentence.
        occ_i = 0
        for word, rx in word_pairs:
            position = 0
            # BUGFIX: the splitter was r'\.!?' (a literal '.' optionally
            # followed by '!'); the intended sentence-terminator class is
            # [.!?].
            for sentence in re.split(r'[.!?]', post.text):
                # BUGFIX: the original called rx.match(post.text) here,
                # re-matching the start of the whole post once per sentence
                # and emitting duplicated, mis-offset spans.  Search the
                # current sentence and offset by its position instead.
                match = rx.search(sentence)
                if match:
                    spans.append([
                        "%s-%s" % (str(post.id), len(spans) + 1),
                        [position + match.start(), position + match.end(),
                         {"category": featpair,
                          "environment_indicators": [(word, str(occ_i))]}]])
                # +1 accounts for the terminator character consumed by
                # re.split, keeping offsets aligned with post.text.
                position += len(sentence) + 1
                occ_i += 1

        # Phrase indicators are matched against the whole post text.
        occ_i = 0
        for phrase, rx in phrase_pairs:
            for m in rx.finditer(post.text):
                spans.append([
                    "%s-%s" % (str(post.id), len(spans) + 1),
                    [m.start(), m.end(),
                     {"category": featpair,
                      # NOTE: phrase occurrences carry -1 while word
                      # occurrences carry a stringified counter --
                      # preserved as in the original.
                      "environment_indicators": [(m.group(0), -1)]}]])
                occ_i += 1

    self.by_topic[topic][POSTS_KEY] += 1

    # Count each distinct category once per post; 'environments' tracks the
    # distinct major categories seen.
    environments = set()
    for category in set(d[1][2]['category'] for d in spans):
        self.freq[category] += 1
        self.by_topic[topic][category] += 1
        self.by_side[topic][post.topic_side][category] += 1
        environments.add(category[0])
    # XXX TODO need to make sure that the environments we are checking for
    # are in the top three as far as probability goes
    self.environments_topic[topic].update([len(environments)])
    if len(environments) < 2:
        # Post discarded: its spans cover fewer than two distinct
        # environments.
        return

    tuples = [(span[1][0], span[1][1], '-'.join(span[1][2]['category']))
              for span in spans]

    b = Boundaries()
    b.initializeFromTuples(tuples)
    if len(b.boundaries) == 0:
        return
    # NOTE(review): walk() receives the whole (start, stop, name) tuple with
    # the largest stop -- presumably the rightmost span; confirm against
    # Boundaries.walk's signature.
    b.walk(1, max(tuples, key=operator.itemgetter(1)))

    fv_all = defaultdict(int)
    fv_collapsed = defaultdict(int)
    fv_commitment = defaultdict(int)
    tokens = 0
    for partition in b.partitions[:-1]:
        # BUGFIX: unigrams was a map() object; under Python 3 len() fails on
        # it and the second iteration below would find it exhausted.  A list
        # comprehension behaves identically under Python 2.
        unigrams = [self.stemmer.stem(u.lower())
                    for u in re.split(r'\W',
                                      post.text[partition[0]:partition[1]])]
        tokens += len(unigrams)
        for _label in set(partition[2].split()):
            for unigram in unigrams:
                fv_commitment['{}:{}'.format(_label, unigram)] += 1
                # NOTE(review): 'none' maps to the 'commitment:' prefix,
                # which looks inverted, but the mapping is preserved as-is;
                # confirm the intended semantics.
                if _label == 'none':
                    fv_collapsed['commitment:{unigram}'.format(
                        unigram=unigram)] += 1
                else:
                    fv_collapsed['non_commitment:{unigram}'.format(
                        unigram=unigram)] += 1
        for unigram in unigrams:
            fv_all['unigram_{unigram}'.format(unigram=unigram)] += 1

    # Normalize every vector by the total token count.  (tokens > 0 here:
    # any non-empty partitions yield at least one split token, and with no
    # partitions the vectors are empty so these loops do not run.)
    for key in fv_all.keys():
        fv_all[key] /= float(tokens)
    for key in fv_commitment.keys():
        fv_commitment[key] /= float(tokens)
    for key in fv_collapsed.keys():
        fv_collapsed[key] /= float(tokens)

    # Attach the class label: the post's side on the topic.
    fv_all[self.label] = post.topic_side
    fv_commitment[self.label] = post.topic_side
    fv_collapsed[self.label] = post.topic_side
    return {'all': fv_all,
            'collapsed': fv_collapsed,
            'commitment': fv_commitment}