__author__ = 'matias'

# Derive a stopword list for one entity type from its saved inverted index.
# Terms are ranked by the number of distinct postings stored under them, and
# every term above a fixed threshold is written, one per line, to
# "<entity_type>_stopwords.txt".  A plot of 1/frequency is shown at the end.

from textanalysis.irdatastructs import InvertedIndex
from matplotlib import pyplot as plt

entity_type = "disease"

# A term becomes a stopword when its posting count is strictly greater
# than this value.
stopword_df_threshold = 80

index = InvertedIndex(entity_type)
index.load()

# (term, distinct posting count) pairs, least frequent first.  set() makes
# repeated postings under the same term count only once.
ranking = sorted(
    ((term, len(set(index.index[term]))) for term in index.index),
    key=lambda tup: tup[1])

with open("%s_stopwords.txt" % (entity_type, ), 'w') as outfile:
    # count is the 1-based rank of the entry in the ascending ordering.
    for count, e in enumerate(ranking, 1):
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (count, e))
        if e[1] > stopword_df_threshold:
            outfile.write("%s\n" % (e[0], ))
print(len(ranking))

# Plot the reciprocal posting count of every ranked term except the last
# (most frequent) entry.
# NOTE(review): the original comment said "plot IDF for all entity terms",
# but the slice drops the final entry -- confirm whether that is intended.
plt.plot([1.0 / tup[1] for tup in ranking[:-1]])
plt.show()
__author__ = 'matias'

# Derive a stopword list for one entity type from its saved inverted index.
# Terms are ranked by the number of distinct postings stored under them, and
# every term above a fixed threshold is written, one per line, to
# "<entity_type>_stopwords.txt".  A plot of 1/frequency is shown at the end.

from textanalysis.irdatastructs import InvertedIndex
from matplotlib import pyplot as plt

entity_type = "disease"

# A term becomes a stopword when its posting count is strictly greater
# than this value.
stopword_df_threshold = 80

index = InvertedIndex(entity_type)
index.load()

# (term, distinct posting count) pairs, least frequent first.  set() makes
# repeated postings under the same term count only once.
ranking = sorted(
    ((term, len(set(index.index[term]))) for term in index.index),
    key=lambda tup: tup[1])

with open("%s_stopwords.txt" % (entity_type,), 'w') as outfile:
    # count is the 1-based rank of the entry in the ascending ordering.
    for count, e in enumerate(ranking, 1):
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (count, e))
        if e[1] > stopword_df_threshold:
            outfile.write("%s\n" % (e[0],))
print(len(ranking))

# Plot the reciprocal posting count of every ranked term except the last
# (most frequent) entry.
# NOTE(review): the original comment said "plot IDF for all entity terms",
# but the slice drops the final entry -- confirm whether that is intended.
plt.plot([1.0 / tup[1] for tup in ranking[:-1]])
plt.show()
__author__ = 'matias'

# Build and save the "disease" and "symptom" inverted indexes by running
# both entity extractors over the text of each case report, stopping after
# at most max_count reports.

from textanalysis.entityextractor import DiseaseExtractor, SymptomExtractor
from textanalysis.texts import CaseReportLibrary
from textanalysis.irdatastructs import InvertedIndex

d_index = InvertedIndex("disease")
s_index = InvertedIndex("symptom")
cases = CaseReportLibrary()
d_extractor = DiseaseExtractor()
s_extractor = SymptomExtractor()

# Upper bound on the number of case reports processed.
max_count = 50000

# count is the 1-based position of the case in the iteration; it is the
# value passed to InvertedIndex.add alongside the extracted entities.
for count, case in enumerate(cases, 1):
    text = case.get_text()
    # set() removes entities extracted more than once from the same text.
    symptoms = list(set(s_extractor.extract(text)))
    diseases = list(set(d_extractor.extract(text)))
    s_index.add(symptoms, count)
    d_index.add(diseases, count)
    # The check precedes the progress output, so the final case is indexed
    # but not echoed.
    # NOTE(review): indentation was lost in the source; the two prints are
    # assumed to belong to the loop body -- confirm.
    if count >= max_count:
        break
    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("%s / %s" % (count, max_count))
    print(symptoms + diseases)

s_index.save()
d_index.save()
__author__ = 'matias'

# Build and save the "disease" and "symptom" inverted indexes by running
# both entity extractors over the text of each case report, stopping after
# at most max_count reports.

from textanalysis.entityextractor import DiseaseExtractor, SymptomExtractor
from textanalysis.texts import CaseReportLibrary
from textanalysis.irdatastructs import InvertedIndex

d_index = InvertedIndex("disease")
s_index = InvertedIndex("symptom")
cases = CaseReportLibrary()
d_extractor = DiseaseExtractor()
s_extractor = SymptomExtractor()

# Upper bound on the number of case reports processed.
max_count = 50000

# count is the 1-based position of the case in the iteration; it is the
# value passed to InvertedIndex.add alongside the extracted entities.
for count, case in enumerate(cases, 1):
    text = case.get_text()
    # set() removes entities extracted more than once from the same text.
    symptoms = list(set(s_extractor.extract(text)))
    diseases = list(set(d_extractor.extract(text)))
    s_index.add(symptoms, count)
    d_index.add(diseases, count)
    # The check precedes the progress output, so the final case is indexed
    # but not echoed.
    # NOTE(review): indentation was lost in the source; the two prints are
    # assumed to belong to the loop body -- confirm.
    if count >= max_count:
        break
    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("%s / %s" % (count, max_count))
    print(symptoms + diseases)

s_index.save()
d_index.save()