def confusion_matrix(self, key=None, output_format=None, split=False):
    """Return a confusion matrix built from a random train/test split.

    Every classified entry is assigned at random (probability 0.5) either to
    the training set or to the testing set of a fresh learner; the learner is
    then asked for its confusion matrix over the testing set.

    key -- forwarded to ``dependent_in_use`` and ``classified_entries``.
    output_format -- ``'html'`` returns table markup, ``'json'`` a JSON
        string, ``'yaml'`` a YAML string; any other value returns raw data.
    split -- when true, one matrix is computed per value returned by
        ``dependent_in_use`` with labels reduced to ``'True'``/``'False'``,
        and the raw result is a dict keyed by that value instead of a
        single matrix.
    """
    targets = self.dependent_in_use(key=key) if split else [None]
    as_html = output_format == 'html'
    html = []
    matrices = {}
    for current_dep in targets:
        held_out = []
        model = self._learner()
        for record in self.classified_entries(key=key):
            if split:
                label = str(record.dependent == current_dep)
            else:
                label = record.dependent
            # The coin is flipped before the Document is built, as before.
            if random.random() < 0.5:
                model.train(Document(record.independent.lower(), stemmer=PORTER), label)
            else:
                held_out.append((Document(record.independent.lower(), stemmer=PORTER), label))
        matrix = model.confusion_matrix(documents=held_out)
        matrices[current_dep] = matrix
        if not as_html:
            continue
        if split:
            html.append('<h4>' + current_dep + "</h4>")
        vals = matrix.keys()
        span = str(len(vals))
        html.append('<table class="table table-bordered"><thead><tr><td></td><td></td><td style="text-align: center" colspan="' + span + '">Actual</td></tr><tr><th></th><th></th>')
        for val in vals:
            html.append('<th>' + val + '</th>')
        html.append('</tr></thead><tbody>')
        for row_number, val_a in enumerate(vals):
            html.append('<tr>')
            if row_number == 0:
                # The "Predicted" caption cell spans every body row.
                html.append('<td style="text-align: right; vertical-align: middle;" rowspan="' + span + '">Predicted</td>')
            html.append('<th>' + val_a + '</th>')
            for val_b in vals:
                html.append('<td>' + str(matrix[val_b].get(val_a, 0)) + '</td>')
            html.append('</tr>')
        html.append('</tbody></table>')
    if as_html:
        return ''.join(html)
    result = matrices if split else matrices[None]
    if output_format == 'json':
        return json.dumps(result, sort_keys=True, indent=4)
    if output_format == 'yaml':
        return yaml.safe_dump(result, default_flow_style=False)
    return result
# setup(): rebuild the module-level lookup tables and train the KNN classifier.
#
# Rebinds the globals `pages`, `urlalias`, `revurlalias` (fresh dicts) and
# `knn` (a fresh KNN()). In the order the statements appear:
#   1. Connects to MySQL and reads `url_alias`: urlalias maps alias -> source,
#      revurlalias maps source -> alias.
#   2. Reads `taxonomy_term_data`: stores each term name in `pages` under
#      'taxonomy/term/<tid>' and, when an alias exists, under the alias too
#      (the alias then becomes the training target). For rows whose vid is 3
#      the description is stripped of markup with BeautifulSoup, newlines are
#      collapsed, the text is lowercased and used (Porter-stemmed) to train
#      knn; the lowercased term name is trained as well.
#   3. Reads node bodies/titles joined to their practice-area terms and trains
#      knn on both, targeting the term URL (or its alias).
#   4. Reads nodes with status=1 and stores their titles in `pages` under
#      'node/<nid>' and under the alias when one exists.
#   5. Closes the MySQL connection, then uses the module-level `conn` to read
#      `website_queries`; each target is split on newlines, commas and
#      semicolons and knn is trained on the lowercased query for every piece,
#      printing a progress line per pair. Finally commits and closes the cursor.
#
# NOTE(review): this definition is stored flattened on two physical lines, so
# the exact nesting cannot be read from here -- in particular whether the
# term-name knn.train() call sits inside `if row[3] == 3`. Confirm against
# the original file before reformatting.
# NOTE(review): the database host and credentials are hard-coded in the
# connect() call; presumably these should come from configuration -- confirm.
# NOTE(review): BeautifulSoup is called without an explicit parser argument.
def setup(): global pages global urlalias global revurlalias global knn pages = dict() urlalias = dict() revurlalias = dict() knn = KNN() db = MySQLdb.connect(host="192.168.200.26", user="******", passwd="xxxsecretxxx", db="pla") cur = db.cursor() cur.execute("select source, alias from url_alias") for row in cur.fetchall(): urlalias[row[1]] = row[0] revurlalias[row[0]] = row[1] cur.execute("select tid, name, description, vid from taxonomy_term_data;") for row in cur.fetchall(): url = 'taxonomy/term/' + str(row[0]) pages[url] = row[1] if url in revurlalias: pages[revurlalias[url]] = row[1] url = revurlalias[url] if row[3] == 3: soup = bs4.BeautifulSoup(row[2]) the_text = re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower() knn.train(Document(the_text, stemmer=PORTER), url) knn.train(Document(row[1].lower()), url) cur.execute( "select a.tid, c.body_value, d.title from taxonomy_term_data as a inner join field_data_field_practice_areas as b on (a.tid=b.field_practice_areas_tid and b.entity_type='node' and b.bundle != 'professionals' and b.deleted=0) inner join field_data_body as c on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) inner join node as d on (c.entity_id=d.nid);" ) for row in cur.fetchall(): url = 'taxonomy/term/' + str(row[0]) if url in revurlalias: url = revurlalias[url] soup = bs4.BeautifulSoup(row[1]) the_text = re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower() knn.train(Document(the_text, stemmer=PORTER), url) knn.train(Document(row[2].lower()), url) cur.execute("select nid, title from node where status=1;") for row in cur.fetchall(): url = 'node/' + str(row[0]) pages[url] = row[1] if url in revurlalias: pages[revurlalias[url]] = row[1] db.close() pgcur = conn.cursor() pgcur.execute( "select query, target from website_queries where target is not null group by query, target" ) for row in pgcur.fetchall(): words = re.split(r'[\n\r,;]+ *', row[1]) for word in words: print("training on " + row[0].lower() + " for " + word) 
knn.train(Document(row[0].lower()), word) conn.commit() pgcur.close()
def resolve_certainty(certainty_info):
    """Classify a certainty phrase with a freshly trained Naive Bayes model.

    Returns the literal ``'No certainty info.'`` for an empty string.
    Otherwise a new NB classifier is trained on every (observation,
    certainty) row of the training CSV, with the certainty converted to int
    and used as the document type, and its prediction for
    ``certainty_info`` is returned.
    """
    if certainty_info == '':
        return 'No certainty info.'
    classifier = NB()
    training_rows = csv('library/templatetags/c_training_data.csv')
    for observation, certainty in training_rows:
        sample = Document(observation, type=int(certainty), stopwords=True)
        classifier.train(sample)
    return classifier.classify(Document(certainty_info))
def evaluate_query(query):
    """Return the targets predicted for `query`, most probable first.

    The module-level ``knn`` classifier is asked for per-target scores; when
    it returns none, its single discrete prediction is used with score 1.0.
    Each target is then passed through ``fixurl`` together with a shared
    ``seen`` set, and targets for which ``fixurl`` returns None are dropped.
    """
    scores = dict(knn.classify(Document(query), discrete=False))
    if not scores:
        scores[knn.classify(Document(query))] = 1.0
    seen = set()
    ranked = sorted(scores, key=scores.get, reverse=True)
    resolved = [fixurl(target, seen) for target in ranked]
    return [target for target in resolved if target is not None]
def nnps_and_keywords(text):
    """Map groups of proper nouns to the keywords found alongside them.

    The text is parsed with ``parsetree`` and each parsed sentence is turned
    into a Document. For every sentence:

    * each keyword tagged 'NNP' contributes the 'NNP' words of its phrase,
      joined with '-';
    * of the top five keywords, the non-'NNP' ones contribute their lemma
      (or their string when there is no lemma).

    A sentence is recorded only when it yields more than one proper noun and
    more than one keyword. Returns a dict mapping a tuple of proper nouns to
    the set of keywords collected for it.

    The key is built from the *sorted* proper nouns: a tuple made straight
    from a set has no guaranteed order, so the same group of names could
    otherwise end up under several different keys instead of being merged.
    """
    nnp_kw = {}
    for sentence in parsetree(text, relations=True, lemmata=True):
        doc = Document(sentence)

        proper_nouns = set()
        for entry in doc.keywords():
            word = entry[1]
            if word.type == 'NNP':
                parts = [wd.string for wd in word.phrase.words if wd.type == 'NNP']
                proper_nouns.add("-".join(parts))

        context_words = set()
        for entry in doc.keywords(top=5):
            word = entry[1]
            if word.type != 'NNP':
                context_words.add(word.lemma or word.string)

        if len(proper_nouns) > 1 and len(context_words) > 1:
            key = tuple(sorted(proper_nouns))
            nnp_kw.setdefault(key, set()).update(context_words)
    return nnp_kw
def crearDocumentoPattern(self, contenido, name=""):
    """Build a Pattern Document from `contenido`, named `name`.

    The Document is created with the Porter stemmer and ``stopwords=True``.

    NOTE(review): the original docstring described this as removing
    stopwords and applying TF-IDF frequency weighting. The weighting keyword
    is spelled ``weigth`` here, so confirm that Document honours it, and
    confirm whether ``stopwords=True`` removes or keeps stop words.
    """
    opciones = dict(name=name, stemmer=PORTER, stopwords=True, weigth=TFIDF)
    return Document(contenido, **opciones)
def summarize(text, n=1):
    """Extract the most relevant sentences from `text` using TextRank.

    - text: string consisting of a few sentences
    - n: number of sentences to extract

    The text is tokenized into sentences, one Document is built per sentence
    (named by its position), a TF-IDF weighted Model is created from them and
    ``utils.textrank`` ranks the model's documents using the model's distance
    function. The `n` best sentences are returned in their original order,
    concatenated without a separator.

    Returns an empty string when there are no sentences or nothing is
    ranked; previously that case failed while unpacking an empty ``zip``.
    """
    sentences = tokenize(text)
    if not sentences:
        return ''
    docs = [Document(sentence, name=position)
            for position, sentence in enumerate(sentences)]
    m = Model(docs, weight=TFIDF)
    ranking = utils.textrank(m.documents, m.distance)
    best = ranking.most_common(n)
    if not best:
        return ''
    top_sents_idx = [position for position, _ in best]
    # Restore document order so the summary reads like the source text.
    return ''.join(sentences[i] for i in sorted(top_sents_idx))
def predict(self, indep, probabilities=False):
    """Return the predicted dependent values for an independent variable.

    indep -- text to classify; line breaks are collapsed to spaces and the
        text is lowercased before classification.
    probabilities -- when true, return ``(value, score)`` tuples instead of
        bare values.

    Returns an empty list when ``_train_from_db()`` reports failure.
    Otherwise returns the predictions sorted by descending score. When the
    learner yields no scores, its single discrete prediction (if not None)
    is used with score 1.0.
    """
    text = re.sub(r'[\n\r]+', r' ', indep).lower()
    if not self._train_from_db():
        return list()
    learner = learners[self.group_id]
    # Build the document once; it is reused for the fallback classification.
    document = Document(text, stemmer=PORTER)
    probs = dict(learner.classify(document, discrete=False))
    if not probs:
        single_result = learner.classify(document)
        if single_result is not None:
            probs[single_result] = 1.0
    ranked = sorted(probs, key=probs.get, reverse=True)
    if probabilities:
        return [(value, probs[value]) for value in ranked]
    return ranked
def _train(self, indep, depend): """Trains the machine learner given an independent variable and a corresponding dependent variable.""" if indep is None: return the_text = re.sub(r'[\n\r]+', r' ', indep).lower() learners[self.group_id].train( Document(the_text.lower(), stemmer=PORTER), depend)
def word_ranking(text, n='L2'):
    """Score the terms of `text` with an LSA-based "cross method" ranking.

    Steps:
      1. tokenize the text into sentences;
      2. build one Document per sentence and a TF-IDF weighted Model;
      3. reduce the model with LSA via ``m.reduce(dimensions=n)``;
      4. in the LSA ``vt`` matrix, zero every cell that is not above the mean
         of its row, scale each row by the matching ``sigma`` value and sum
         down the columns.

    - text: string consisting of a few sentences
    - n: forwarded to ``Model.reduce`` as ``dimensions``

    Returns a Counter mapping each entry of ``m.lsa.terms`` to its score.

    Sources cited by the original author:
    http://www.aclweb.org/anthology/C10-1098.pdf
    http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    """
    sentences = tokenize(text)
    docs = [Document(sentence, name=position)
            for position, sentence in enumerate(sentences)]
    m = Model(docs, weight=TFIDF)
    m.reduce(dimensions=n)

    # Per the original notes: rows are concepts/topics, columns are terms.
    vt = np.array(m.lsa.vt)
    row_means = vt.mean(axis=1).reshape((-1, 1))
    vt[vt <= row_means] = 0.0

    sigma = np.array(m.lsa.sigma).reshape((-1, 1))
    scores = (vt * sigma).sum(axis=0)
    return Counter(dict(zip(m.lsa.terms, scores)))
def articles_to_trends(articles):
    """Print the name and top ten keywords of each day's news.

    Articles without an ``'added_at'`` value are skipped. For the others the
    article text is fetched from ``story['url']`` and ``timestamptext``
    returns a (date, text) pair. Texts are grouped per date and keyed by
    ``hash(text)`` so duplicate content is stored once. Each date becomes a
    single lowercased Document (lemmatized, excluding 'news' and 'day')
    appended to one Model, so that tf-idf can be computed across days.
    """
    news = {}
    for story in articles:
        if not story['added_at']:
            continue
        article_text = get_article_text(story['url'])
        day, text = timestamptext(story['added_at'], article_text)
        news.setdefault(day, {})[hash(text)] = text

    model = Model()
    for date, stories in news.items():
        merged = ' '.join(stories.values()).lower()
        model.append(Document(merged, stemmer=LEMMA,
                              exclude=['news', 'day'], name=date))

    for document in model:
        print(document.name)
        print(document.keywords(top=10))
def summarize(text_to_summarize):
    """Return the two highest-ranked sentences of the text, in text order.

    Step 1: tokenize into sentences and build a lemmatized Document for each
            sentence that splits into more than 7 space-separated pieces;
            the Document name is the sentence index.
    Step 2: compute the Jaccard distance between the keyword lists of every
            ordered pair of distinct sentences.
    Step 3: add an edge weighted ``1 - distance`` for every pair whose
            distance is below 1.
    Step 4: run PageRank on the resulting directed graph, keep the two
            best-scored sentences and join them with a space.

    Keyword lists are computed once per sentence rather than once per pair.
    """
    stokens = tokenize(text_to_summarize)
    docs = [Document(string=s, name=e, stemmer=LEMMA)
            for e, s in enumerate(stokens) if len(s.split(" ")) > 7]

    # (node name, keyword list) per sentence, hoisted out of the pair loop.
    keyword_lists = [(str(doc.name), [kw[1] for kw in doc.keywords()])
                     for doc in docs]

    linkgraph = []
    for name_a, words_a in keyword_lists:
        for name_b, words_b in keyword_lists:
            if name_a == name_b:
                continue
            jacc_dist = distance.jaccard(words_a, words_b)
            if jacc_dist < 1:
                linkgraph.append((name_a, name_b, 1 - jacc_dist))

    D = nx.DiGraph()
    D.add_weighted_edges_from(linkgraph)
    pagerank = nx.pagerank(D)

    # Ascending sort, so the two best scores are the last two entries.
    ascending = sorted(pagerank.items(), key=operator.itemgetter(1))
    orderedtop2 = sorted(int(name) for name, _ in ascending[-2:])
    return " ".join(stokens[i] for i in orderedtop2)
def feeds_to_trends(feeds):
    """Print the name and top ten keywords of each day's news, feed by feed.

    Each item of `feeds` provides its address under ``'feed_url'``. Stories
    are fetched uncached through ``Newsfeed().search``; ``datetext`` returns
    a (date, text) pair for each. Texts are grouped per date and keyed by
    ``hash(text)`` so duplicate content is stored once. Each date becomes a
    single lowercased Document (lemmatized, excluding 'news' and 'day') in a
    per-feed Model, so that tf-idf can be computed across days.

    When a feed raises HTTP404NotFound its URL is printed and processing
    continues with the next feed.
    """
    for feed in feeds:
        url = feed['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                day, text = datetext(story.date, story.description)
                news.setdefault(day, {})[hash(text)] = text

            model = Model()
            for date, stories in news.items():
                merged = ' '.join(stories.values()).lower()
                model.append(Document(merged, stemmer=LEMMA,
                                      exclude=['news', 'day'], name=date))

            for document in model:
                print(document.name)
                print(document.keywords(top=10))
        except HTTP404NotFound:
            print(url)
def get_keywords_article(article):
    """Store the top five keywords of `article` on ``article.keywords``.

    Only the ``Word`` of entries in ``article.tagged_content`` whose ``Tag``
    starts with 'NN' are fed to the Document the keywords come from.
    Nothing is returned; the article is modified in place.
    """
    noun_words = [token.Word
                  for token in article.tagged_content
                  if token.Tag.startswith('NN')]
    article.keywords = Document(noun_words).keywords(top=5)
def build_model(results=None):
    """Build a binary sentence-by-feature matrix for a list of result dicts.

    results -- sequence of dicts providing 'text', 'url' and 'index'
        (defaults to no results).

    One lemmatized Document is created per result and all of them go into a
    TF-IDF weighted Model. Each result's lowercased text is then split into
    sentences; a sentence with at least five distinct words that are not in
    ``stopwords_hash`` becomes one matrix row holding 1 for every model
    feature the sentence contains and 0 otherwise.

    Returns ``(model, m, model_sentences, sentence_dict)``:
      model -- float array of shape (kept sentences, number of features)
      m -- the Pattern Model
      model_sentences -- the kept sentences, in row order
      sentence_dict -- maps a sentence number to its Document name

    NOTE(review): sentence_dict keys start at 1 while rows of `model` and
    entries of `model_sentences` start at 0. That numbering is kept as it
    was; confirm whether callers rely on it.
    """
    if results is None:
        results = []
    documents = [
        Document(i.get('text'), name=i.get('url'),
                 description=i.get('index'), stemmer=LEMMA)
        for i in results
    ]
    m = Model(documents, weight=TFIDF)
    features = m.features

    rows = []
    sentence_dict = {}
    model_sentences = []
    for document, result in zip(documents, results):
        for sentence in sent_tokenize(result.get('text').lower()):
            present = set(
                w for w in words(sentence, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            )
            if len(present) < 5:
                continue
            model_sentences.append(sentence)
            rows.append([1 if w in present else 0 for w in features])
            sentence_dict[len(rows)] = document.name

    # Build the matrix once instead of growing it row by row; the explicit
    # shape keeps (0, n_features) for "no sentences kept".
    model = np.array(rows, dtype=float).reshape(len(rows), len(features))
    return model, m, model_sentences, sentence_dict
def doclist_from_feeds(feeds):
    """Return one lemmatized Document per entry of ``gettitles(feeds)``.

    The titles stored under each key are joined with single spaces and
    turned into a Document with ``threshold=0``.
    """
    titles = gettitles(feeds)
    return [
        Document(" ".join(titles[key]), stemmer=LEMMA, threshold=0)
        for key in titles
    ]
def get_labeled_feats(self, data):
    """Turn (word, tag) pairs into labeled Documents of binary features.

    For each pair, ``FeatExtract(...).binary_features()`` supplies the
    Document content and the tag becomes the Document type. ``ArtOrDet`` is
    true exactly when ``self.error_tag`` equals 'ArtOrDet'.
    """
    return [
        Document(
            FeatExtract(
                word, ArtOrDet=(self.error_tag == 'ArtOrDet')
            ).binary_features(),
            type=tag,
            stopwords=True,
        )
        for (word, tag) in data
    ]
def getMod():
    """Build a Model from the essays under 'essays/original/' and reduce it.

    Every text file found by ``fio.recGetTextFiles`` is read as UTF-8, run
    through ``PageParser.parse`` and TextBlob, stripped of the words in
    ``cachedStopWords`` and added as a Document (named after the file,
    ``top=40``). Returns the result of ``Model.reduce(5)``.

    The statements that used to follow the ``return`` (writing concepts and
    per-document categories to 'lsa.txt') could never run and were removed.
    """
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
        text = ' '.join([
            word for word in text.words if word not in cachedStopWords
        ]).lstrip()
        docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa
def asDocumentReview(data):
    """Convert a list of reviews into Pattern Documents typed by score.

    Each review dict supplies its text under 'review/text' and its score
    under 'review/score'; the score is converted to float and used as the
    Document type.
    """
    return [
        Document(record['review/text'],
                 type=float(record['review/score']),
                 stopwords=True)
        for record in data
    ]
def asDocumentClass(data, classification):
    """Convert a list of reviews into Pattern Documents of one class.

    Each review dict supplies its text under 'review/text'; every Document
    gets ``str(classification)`` as its type.
    """
    return [
        Document(record['review/text'],
                 type=str(classification),
                 stopwords=True)
        for record in data
    ]
def insertarDocumento(self, url, contenido):
    """Create a MongoDB record and a saved Pattern Document file for a page.

    url -- used as the Document name.
    contenido -- text the Document is built from.

    The Document is handed to ``self.mongodb.crearDocumento``; when that
    returns a truthy result the Document is also saved under
    'DocumentoPattern/<inserted_id>'. The Document is returned either way.

    The stemmer is now passed as ``stemmer=PORTER``. It used to be passed as
    ``stemming=``, a keyword this file uses nowhere else, so Porter stemming
    was not actually requested.

    NOTE(review): ``weigth`` is left as found; confirm whether Document
    accepts a weighting argument at all.
    """
    unDocumento = Document(contenido, name=url, stopwords=True,
                           stemmer=PORTER, weigth=TFIDF)
    result = self.mongodb.crearDocumento(unDocumento)
    if result:
        unDocumento.save("DocumentoPattern/" + str(result.inserted_id))
    return unDocumento
def run(self, minePackage):
    """Store a weighted-average score ('weight_WA') on every graph node.

    minePackage -- dict providing 'clouds' (objects with a ``graph``) and
        'searchKeyStemmer' (the query terms).

    The keywords of each node's content (top 200, normalized, Porter
    stemmed) are split three ways: found in the query, found in the
    dictionary file next to this module, or neither. The node's score is
    the average of the three accumulated weights weighted by alpha, beta
    and gamma, or 0 when nothing has been accumulated.

    The dictionary file is read inside a ``with`` block so its handle is
    closed, and its path is built with ``os.path.join``.

    NOTE(review): the three counters are never reset between nodes, so each
    node's score also includes the keywords of all nodes processed before
    it. Kept as found; confirm whether that is intended.
    """
    ac = 0.0  # weight of keywords found in the query
    ap = 0.0  # weight of keywords found in the dictionary
    an = 0.0  # weight of every other keyword
    alpha = 1.00
    beta = 0.75
    gamma = 0.25

    dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary.txt")
    with open(dictionary_path, 'r') as handle:
        dictionary = Document(handle.read(), stemmer=PORTER)

    clouds = minePackage['clouds']
    query = minePackage['searchKeyStemmer']
    for cloud in clouds:
        for n in cloud.graph.nodes():
            node = cloud.graph.node[n]
            methodData = node['methodData']
            content = Document(methodData.getContent(), stemmer=PORTER)
            for entry in content.keywords(top=200, normalized=True):
                if entry[1] in query:
                    ac += entry[0]
                elif entry[1] in dictionary.words:
                    ap += entry[0]
                else:
                    an += entry[0]
            total = ac + ap + an
            if total > 0:
                node['weight_WA'] = (
                    (ac * alpha) + (ap * beta) + (an * gamma)) / total
            else:
                node['weight_WA'] = 0
def calculate(self, minePackage):
    """Store a vector-space similarity score ('weight_VSM') on every node.

    minePackage -- dict providing 'searchKey' (the query text) and 'clouds'
        (objects with a ``graph``).

    A Document is built from each node's ``methodData.getData()`` and all of
    them form a TF-IDF weighted Model. Each node then receives the model's
    similarity between a fresh Document of its data and the query Document.
    """
    query = Document((minePackage['searchKey']))
    clouds = minePackage['clouds']

    # Results are unused; the calls are kept in case they have side effects.
    UnPack().total(clouds)
    UrlToPlainText()

    webDocuments = [
        Document(cloud.graph.node[n]['methodData'].getData())
        for cloud in clouds
        for n in cloud.graph.nodes()
    ]
    m = Model(documents=webDocuments, weight=TFIDF)

    for cloud in clouds:
        for n in cloud.graph.nodes():
            attributes = cloud.graph.node[n]
            vector = Document(attributes['methodData'].getData())
            # Set the VSM value on the cloud's node.
            attributes['weight_VSM'] = m.similarity(vector, query)
def summarize(raw_text):
    """Return up to three of the best-ranked sentences of `raw_text`.

    Sentences come from NLTK's English punkt tokenizer. A sentence that
    splits into more than 5 space-separated pieces and yields at least one
    feature becomes a lemmatized Document named by its position. Every
    ordered pair of distinct documents with a positive Jaccard similarity
    becomes a weighted edge, PageRank scores the graph, and the three
    best-scored sentences are returned in text order, joined by spaces.
    When nothing is ranked the first sentence is returned.

    Returns "" for empty input and also when the tokenizer finds no
    sentence at all (for example whitespace-only input), which previously
    raised IndexError.
    """
    if not raw_text:
        return ""

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokens = sentence_tokenizer.tokenize(raw_text.strip())
    if not tokens:
        return ""

    documents = []
    for position, sentence in enumerate(tokens):
        if len(sentence.split(" ")) > 5:
            document = Document(string=sentence, name=position, stemmer=LEMMA)
            if len(document.features) > 0:
                documents.append(document)

    edges = []
    for document in documents:
        for other_document in documents:
            if document.name == other_document.name:
                continue
            similarity = jaccard_similarity(document.features,
                                            other_document.features)
            if similarity > 0:
                edges.append((document.name, other_document.name, similarity))

    graph = networkx.DiGraph()
    graph.add_weighted_edges_from(edges)
    page_rank = networkx.pagerank(graph)
    sorted_ranks = sorted(page_rank.items(), key=operator.itemgetter(1),
                          reverse=True)

    num_sentences = 3
    sentence_numbers = sorted(
        node for node, _ in sorted_ranks[:num_sentences])
    summary = [tokens[number] for number in sentence_numbers]
    if not summary:
        summary.append(tokens[0])
    return " ".join(summary)
def extractSentiment(characterSentences):
    """Classify the tone of each character's sentences with Naive Bayes.

    A new NB classifier is trained on every (review, rating) row of
    'reviews.csv', with the rating converted to int and used as the document
    type. Each sentence of each character is then classified from its
    ``str()`` form.

    Returns a defaultdict(list) mapping each key of `characterSentences` to
    the predicted tones of its sentences, in order; a key with no sentences
    gets no entry.
    """
    classifier = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        sample = Document(review, type=int(rating), stopwords=True)
        classifier.train(sample)
    for character, sentences in characterSentences.items():
        for sentence in sentences:
            tone = classifier.classify(str(sentence))
            characterTones[character].append(tone)
    return characterTones
def create_doc_list(df):
    """Create a list of Pattern Documents from a dataframe of reviews.

    `df` must contain an 'id' column and a 'review' column. Because of how
    the data is formatted in the dataframe, each id carries an extra quote
    at its beginning and end; both are stripped before the id is used as
    the Document name. Documents are created with ``threshold=1``.
    A progress line with the number of rows is printed first.
    """
    print("Creating a list of {} documents".format(len(df)))
    return [
        Document(row['review'], threshold=1, name=row['id'][1:-1])
        for _, row in df.iterrows()
    ]
def classify(text):
    """Run every classifier held by ``Classifications`` over `text`.

    Returns a dict with the keys 'text', 'rate', 'category', 'rate_nlp' and
    'positivity'. The category and rate classifiers receive a Document of
    the text; the NLP rate and sentiment classifiers receive
    ``Classifications.selectWords(text)``.

    'positivity' is True when the selected sentiment label, as a string, is
    one of 'True', '3.0', '4.0' or '5.0'.

    NOTE(review): the label is taken from index 1 of the scores sorted in
    descending order, i.e. the second-ranked label rather than the top one,
    and this raises IndexError when fewer than two labels are scored.
    Kept as found; confirm whether index 0 was intended.
    """
    predicted_category = Classifications._category.classify(
        Document(text), discrete=True)
    predicted_rate = Classifications._rating.classify(
        Document(text), discrete=True)
    predicted_rate_nlp = Classifications._rating_nlp.classify(
        Classifications.selectWords(text), discrete=True)
    sentiment_scores = Classifications._sentiment.classify(
        Classifications.selectWords(text), discrete=False)

    ranked = sorted(sentiment_scores.items(),
                    key=operator.itemgetter(1), reverse=True)
    chosen_label = str(ranked[1][0])
    predicted_sentiment = chosen_label in ['True', '3.0', '4.0', '5.0']

    return {
        'text': text,
        'rate': predicted_rate,
        'category': predicted_category,
        'rate_nlp': predicted_rate_nlp,
        'positivity': predicted_sentiment
    }
def extract():
    """Extract clustered features from the app-description CSV files.

    OUTPUT_PATH is removed if it exists and recreated. For every entry of
    INPUT_PATH whose name does not start with '.', a directory of the same
    name is created under OUTPUT_PATH and each CSV file in it is processed:
    the first row is skipped, and for every remaining row the description
    (third column) is prepared, split into three-word featurelets, clustered
    hierarchically (k=3, 1000 iterations, cosine distance) and grouped into
    features and alternative tokens. One output row of
    ``[name, description, features, alterTokens]`` is written per app to a
    file of the same name under OUTPUT_PATH.

    Local names no longer shadow the builtins ``dir``, ``file`` and
    ``list``, and the explicit ``close()`` calls were dropped because the
    ``with`` blocks already close both files.

    NOTE(review): the files are opened in 'rb'/'wb' mode for the csv module,
    which is presumably the Python 2 convention; kept as found.
    """
    print('Extracting features from app descriptions...\n')
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)

    for category in os.listdir(INPUT_PATH):
        if category.startswith('.'):
            continue
        os.makedirs("{}/{}".format(OUTPUT_PATH, category))
        for filename in os.listdir('{}/'.format(INPUT_PATH) + category):
            source_path = '{}/{}/{}'.format(INPUT_PATH, category, filename)
            target_path = '{}/{}/{}'.format(OUTPUT_PATH, category, filename)
            with open(source_path, 'rb') as f:
                reader = csv.reader(f)
                next(reader)  # skip the header row
                with open(target_path, 'wb') as r:
                    writer = csv.writer(r)
                    for app in reader:
                        name = app[0]
                        description = app[2]
                        # Prepare the description for NLTK and LDA processing
                        preparedDescription = prepare_description(description)
                        # Extract three-word featurelets from the description
                        featurelets = featurelet_extraction(preparedDescription)
                        documents = []
                        for feature in featurelets:
                            featurelet = '{} {} {}'.format(
                                feature[0], feature[1], feature[2])
                            documents.append(
                                Document(featurelet, name=featurelet))
                        # Perform hierarchical clustering
                        m = Model(documents)
                        cluster = m.cluster(method=HIERARCHICAL, k=3,
                                            iterations=1000, distance=COSINE)
                        # Organize clusters into features and alternative tokens
                        (features, alterTokens) = group(cluster, [], [], [])
                        writer.writerow(
                            [name, description, features, alterTokens])
def get_model_from_documents(path='./*/*.txt'):
    """Return the Documents and TF-IDF Model built from the matching files.

    path -- glob pattern selecting the files to read.

    Returns ``(documents, model)``: one Document per matched file and a
    Model over all of them weighted with TFIDF.

    The `path` argument is now honoured; it used to be ignored in favour of
    a hard-coded './*/*.*' pattern. Each file is also closed after reading.
    """
    import codecs
    import glob
    from pattern.vector import Document, Model, TFIDF

    documents = []
    for filename in glob.glob(path):
        with codecs.open(filename, 'r') as handle:
            documents.append(Document(handle.read()))
    model = Model(documents=documents, weight=TFIDF)
    return documents, model
def GetVectors():
    """Build a Model from the extended texts of the files under 'training'.

    Each file found by ``fio.recGetTextFiles`` is expanded with
    ``ExtendText`` using one shared PerceptronTagger. A Document is named
    after the category ('high', 'medium' or 'low') that occurs in the file
    path, followed by a running count for that category; when several
    categories occur, every matching counter advances and the last match
    supplies the name. A path matching none gets an empty name.
    """
    essay_path = 'training'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    for f in files:
        extended_text = ExtendText(f, percepticon)
        name = ''
        for cat in ('high', 'medium', 'low'):
            if cat not in f:
                continue
            name = cat + str(cat_dict[cat])
            cat_dict[cat] += 1
        docs.append(Document(extended_text, name=name, top=None))
    return Model(docs)