def tagged_words(self, lemmatize=True):
    """Return one tuple per word element in the corpus file.

    With ``lemmatize`` true the tuples are ``(text, pos, lemma)``, where the
    lemma comes from ``self.get_lemma``; otherwise they are ``(text, pos)``.
    """
    words = XMLCorpusView(self.path, '.*/w')
    if lemmatize:
        return [(w.text, w.attrib['pos'], self.get_lemma(w)) for w in words]
    return [(w.text, w.attrib['pos']) for w in words]
def sentences(self):
    """Returns a list of sentences where each sentence is a list of words"""
    return [
        [word.text for word in sentence]
        for sentence in XMLCorpusView(self.path, '.*/sentence')
    ]
def tagged_sentences(self, lemmatize=True):
    """Return a list of sentences, each a list of tagged-word tuples.

    Tuples are ``(text, pos, lemma)`` when ``lemmatize`` is true (lemma via
    ``self.get_lemma``), else ``(text, pos)``.
    """
    # Choose the per-word tagging function once, outside the loops.
    if lemmatize:
        tag = lambda w: (w.text, w.attrib['pos'], self.get_lemma(w))
    else:
        tag = lambda w: (w.text, w.attrib['pos'])
    sents = XMLCorpusView(self.path, '.*/sentence')
    return [[tag(word) for word in sent] for sent in sents]
# Reduce each document's matrix to 2 dimensions with t-SNE; as before, only
# the first row of each transformed result is kept.
doc_2d = []
for doc, file in zip(matrix, filenames):
    #print(file, "\n", doc, "\n\n") # debug msg
    doc_2d.append(TSNE().fit_transform(doc).tolist()[0])
matrix = np.asarray(doc_2d)  # update matrix array

# raw output
np.savetxt('lsa_reduced.csv', matrix, delimiter='\t')

# build list of tags from the metadata
metadata = pd.DataFrame(index=filenames, columns=['Tags'])
view = XMLCorpusView('txt/export-abstracts.xml', '.*/article')
# FIX: the original bound this iterator to `iter`, shadowing the builtin.
for entry in view.iterate_from(0):
    # Row index is the article's xml:id plus '.txt', matching the filenames.
    metadata.loc[
        entry.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.txt',
        'Tags',
    ] = entry.attrib['type']
metadata.to_csv('lsa_metadata.csv')

##############################################################################
# CLUSTERING

print("clustering ...\n")
#af = AffinityPropagation(damping=0.9, affinity="euclidean", preference=-50).fit(matrix)
af = AffinityPropagation().fit(matrix)  # default