def documents2feature_vectors(documents):
    """Turn documents into a sparse feature matrix and binary labels.

    Every document is run through ``PolitenessFeatureVectorizer.features``.
    The column order is the sorted list of feature names taken from the
    first document that yields a non-empty feature mapping; that order is
    then reused for the remaining documents.

    Args:
        documents: iterable of mappings, each providing a ``'score'`` entry
            and accepted by ``PolitenessFeatureVectorizer.features``.

    Returns:
        Tuple ``(X, y)``. ``X`` is a ``csr_matrix`` built from the
        per-document feature rows; ``y`` is an array with 1 where the
        document's score is greater than 0.0 and 0 otherwise.
    """
    extractor = PolitenessFeatureVectorizer()
    feature_names = None
    rows = []
    labels = []

    for document in documents:
        feature_map = extractor.features(document)

        # Fix the column order lazily, from the first usable feature map.
        if not feature_names:
            feature_names = sorted(feature_map.keys())

        row = [feature_map[name] for name in feature_names]

        # A strictly positive politeness score marks the polite class (1).
        if document['score'] > 0.0:
            label = 1
        else:
            label = 0

        rows.append(row)
        labels.append(label)

    X = csr_matrix(np.asarray(rows))
    y = np.asarray(labels)
    return X, y
def get_features(requests):
    """Build a dense feature matrix and one-hot labels from requests.

    Every request is run through ``PolitenessFeatureVectorizer.features``
    (per the original notes: unigram, bigram and politeness-strategy
    indicators keyed by feature name). The column order is the sorted list
    of feature names taken from the first request that yields a non-empty
    feature mapping; that order is reused for the remaining requests.

    Args:
        requests: iterable of mappings, each providing a ``'score'`` entry
            and accepted by ``PolitenessFeatureVectorizer.features``.

    Returns:
        Tuple ``(X, y)``. ``X`` is a dense array built from the per-request
        feature rows. ``y`` is a float array of shape ``(len(requests), 2)``
        where column 1 is set for requests whose score is greater than 0.0
        (polite) and column 0 is set for all others.
    """
    extractor = PolitenessFeatureVectorizer()
    feature_names = None
    rows = []
    labels = []

    for request in requests:
        feature_map = extractor.features(request)

        # Fix the column order lazily, from the first usable feature map.
        if not feature_names:
            feature_names = sorted(feature_map.keys())

        rows.append([feature_map[name] for name in feature_names])

        # A strictly positive politeness score marks the polite class (1).
        labels.append(1 if request['score'] > 0.0 else 0)

    X = np.asarray(rows)

    # One-hot encode: row i gets a 1 in the column named by its class label.
    # The integer dtype keeps the index array valid even when it is empty.
    label_indices = np.asarray(labels, dtype=int)
    y = np.zeros((len(label_indices), 2))
    y[np.arange(len(label_indices)), label_indices] = 1
    return X, y
get_parses("That is weird. Why can't you just store the \"Range\"?")) TEST_DOCUMENTS.append( get_parses( "This was supposed to have been moved to <url> per the cfd. why wasn't it moved?" )) TEST_DOCUMENTS.append(get_parses("You are wrong. But the approach is correct")) # TEST_DOCUMENTS.append(get_parses("")) pp = pprint.PrettyPrinter(indent=4) fks = False for each in TEST_DOCUMENTS: # print each['sentences'] # print fs = vectorizer.features(each) for feature, score in fs.items(): if score == 1: if (feature.startswith('feature_politeness')): # print feature if ('==Direct_question==' in feature): each['sentences'][0] = "Sorry but " + each['sentences'][0] # if ('==2nd_person==' in feature): # li = [] # for i in each['sentences'][0].split(" "): # y = 'we' if i.lower() == 'you' else i # li.append(y) # each['sentences'] = sent_tokenize(' '.join(li)) if ('==Direct_start==' in feature): li = each['sentences'][0].split(" ") li[0] = 'Do'