#Prints two distributions over a set of zbl records, as MediaWiki table rows:
#how often each MSC code frequency occurs, and how many codes each document has.
import sys
from collections import defaultdict

from zbl2py import record_read


def print_codes_distribution(records):
    #count how many documents carry each category code:
    categs = defaultdict(lambda: 0)
    for rec in records:
        for c in rec['categories']:
            categs[c] += 1
    #print the sorted distribution of code frequencies
    #(each row: document count || number of codes with that count):
    categs_occurences = defaultdict(lambda: 0)
    for _, v in categs.iteritems():
        categs_occurences[v] += 1
    msc_codes = sorted(list(categs_occurences.iteritems()), key=lambda x: x[1])
    for cd in msc_codes:
        print "|", cd[0], "||", cd[1]
        print "|-"


def print_docs_distribution(records):
    #count how many documents have a given number of category codes:
    docs = defaultdict(lambda: 0)
    for rec in records:
        docs[len(rec['categories'])] += 1
    #print the sorted distribution
    #(each row: codes per document || number of such documents):
    doc_distr = sorted(list(docs.iteritems()), key=lambda x: x[1])
    for cd in doc_distr:
        print "|", cd[0], "||", cd[1]
        print "|-"


if __name__ == "__main__":
    records = record_read.read_list_records(sys.argv[1])
    print_codes_distribution(records)
    print_docs_distribution(records)
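#A toy illustration of the two printers above, using hypothetical records with
#the 'categories' field they expect; real records come from
#record_read.read_list_records. It assumes it is appended to (or run inside)
#the script above, whose file name is not shown in this listing.
toy_records = [
    {'categories': ['20K01', '16S34']},
    {'categories': ['20K01']},
    {'categories': ['16S34', '20K01', '03E05']},
]
print_codes_distribution(toy_records)
#emits rows such as "| 3 || 1" followed by "|-", meaning: exactly one code
#(here '20K01') occurs in 3 documents
print_docs_distribution(toy_records)
#emits rows such as "| 2 || 1", meaning: one document carries exactly 2 codes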
#Main script: leave-one-out evaluation of a multi-label SVM classifier over
#all categories.
import sys

from wordsfreq import select_descriptive_words
from features import records_to_words_weights_converter
from zbl2py import record_read
from classifier_tester import LeaveOneOutAllCategories
from classifier_svm import SvmWordsClassifier


if __name__ == '__main__':
    #read words that are most important:
    extr_fromfname = sys.argv[1]
    basefname = sys.argv[2]
    words_count = int(sys.argv[3])
    thresh_div = float(sys.argv[4])
    records_file = sys.argv[5]
    test_samples = int(sys.argv[6])
    print "Arguments read:"
    print "extr_fromfname =", extr_fromfname
    print "basefname =", basefname
    print "words_count =", words_count
    print "thresh_div =", thresh_div
    print "records_file =", records_file
    print "test_samples =", test_samples

    words = select_descriptive_words.select_descriptive_words_quotientmethod(
        extr_fromfname, basefname, words_count, thresh_div)

    #read records and convert them into feature vectors:
    frecords = list(records_to_words_weights_converter.convert_records_to_words(
        record_read.read_list_records(records_file), words))

    #create frecords with numerical labels:
    #build a multi-label SVM based on this data and evaluate it with leave-one-out:
    loo = LeaveOneOutAllCategories(SvmWordsClassifier, frecords)
    corr = loo.test(test_samples)
    print "Correctness:", corr
#Main script: leave-one-out evaluation of a decision-tree classifier for the
#single most common category (binary: does the record carry that code or not).
import sys

from wordsfreq import select_descriptive_words
from features import records_to_words_weights_converter
from zbl2py import record_read
#assumption: LeaveOneOut lives in classifier_tester, next to LeaveOneOutAllCategories
from classifier_tester import LeaveOneOut
from classifier_tree import TreeSingleTagWordsClassifier
from mainleave1out_biggestcategory_svm import extract_most_common_categ


if __name__ == '__main__':
    #read words that are most important:
    extr_fromfname = sys.argv[1]
    basefname = sys.argv[2]
    words_count = int(sys.argv[3])
    thresh_div = float(sys.argv[4])
    records_file = sys.argv[5]
    test_samples = int(sys.argv[6])
    print "Arguments read:"
    print "extr_fromfname =", extr_fromfname
    print "basefname =", basefname
    print "words_count =", words_count
    print "thresh_div =", thresh_div
    print "records_file =", records_file
    print "test_samples =", test_samples

    words = select_descriptive_words.select_descriptive_words_quotientmethod(
        extr_fromfname, basefname, words_count, thresh_div)

    #read records and convert them into feature vectors:
    frecords = list(records_to_words_weights_converter.convert_records_to_words(
        record_read.read_list_records(records_file), words))

    #create frecords with numerical labels:
    #build a single-category tree classifier on this data:
    most_common_categ, max_cnt = extract_most_common_categ(frecords)
    print "Most common category is:", most_common_categ, "with", max_cnt, "occurrences."
    loo = LeaveOneOut(
        lambda samples: TreeSingleTagWordsClassifier(most_common_categ, samples,
                                                     featurenames=words),
        frecords,
        lambda x: [int(most_common_categ in x[1])])
    corr = loo.test(test_samples)
    print "Correctness:", corr
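#The project-specific LeaveOneOut tester is not part of this listing. Purely as
#an illustration of the protocol implied by the call above -- a factory that
#builds a classifier from training samples, a list of (features, categories)
#samples, and a function extracting the expected label vector -- here is a
#minimal sketch; the names and the classify() method are hypothetical, not the
#project's actual API.
import random


def leave_one_out_sketch(classifier_factory, samples, expected_labels, test_samples):
    """Hold out one sample at a time, train on the rest, and score accuracy."""
    indices = range(len(samples))
    random.shuffle(indices)
    correct = 0
    tested = 0
    for i in indices[:test_samples]:
        train = samples[:i] + samples[i + 1:]
        classifier = classifier_factory(train)
        predicted = classifier.classify(samples[i][0])  #hypothetical method name
        if predicted == expected_labels(samples[i]):
            correct += 1
        tested += 1
    return float(correct) / tested if tested else 0.0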
#Main script: k-fold cross-validation of the single-category decision-tree
#classifier; descriptive words come from the records' keywords here.
import sys

from wordsfreq import select_descriptive_words
from features import records_to_words_weights_converter
from zbl2py import record_read
#assumption: KFold lives in classifier_tester, next to the leave-one-out testers
from classifier_tester import KFold
from classifier_tree import TreeSingleTagWordsClassifier
from mainleave1out_biggestcategory_svm import extract_most_common_categ


if __name__ == '__main__':
    #read words that are most important:
    extr_fromfname = sys.argv[1]
    basefname = sys.argv[2]
    words_count = int(sys.argv[3])
    thresh_div = float(sys.argv[4])
    records_file = sys.argv[5]
    k = int(sys.argv[6])
    print "Arguments read:"
    print "extr_fromfname =", extr_fromfname
    print "basefname =", basefname
    print "words_count =", words_count
    print "thresh_div =", thresh_div
    print "records_file =", records_file
    print "k =", k

    records = record_read.read_list_records(records_file)
    words = select_descriptive_words.select_descriptive_words_keywords(records)
    #words = select_descriptive_words.select_descriptive_words_quotientmethod(
    #    extr_fromfname, basefname, words_count, thresh_div)

    #read records and convert them into feature vectors:
    frecords = list(records_to_words_weights_converter.convert_records_to_words(records, words))

    #create frecords with numerical labels:
    #build a single-category tree classifier on this data:
    most_common_categ, max_cnt = extract_most_common_categ(frecords)
    print "Most common category is:", most_common_categ, "with", max_cnt, "occurrences."
    kf = KFold(
        lambda samples: TreeSingleTagWordsClassifier(most_common_categ, samples,
                                                     featurenames=words),
        frecords,
        lambda x: [int(most_common_categ in x[1])],
        k)
    corr = kf.test()
    print "Correctness:", corr
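#The KFold tester is likewise not shown in this listing. As an illustration
#only of the splitting step it presumably performs, the sketch below partitions
#the samples into k folds; each fold then serves once as the test set while the
#remaining folds form the training set. The names are hypothetical.
def split_into_folds_sketch(samples, k):
    """Assign sample i to fold i % k and return the k folds."""
    folds = [[] for _ in range(k)]
    for i, sample in enumerate(samples):
        folds[i % k].append(sample)
    return folds


print split_into_folds_sketch(range(7), 3)
#prints [[0, 3, 6], [1, 4], [2, 5]]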
#Filters out category codes that occur fewer than a threshold number of times,
#then drops records left with no codes at all, and stores the result.
from collections import defaultdict


def filter_out_rare_codes_records(records, thresh_categs_count,
                                  category_field_name='categories'):
    #assumption: the opening lines of this function are missing from the
    #listing; the field name and the counting loop below follow the pattern
    #of the other scripts.
    categs = defaultdict(lambda: 0)
    for rec in records:
        for c in rec[category_field_name]:
            categs[c] += 1
    #collect and delete the rare categories:
    to_del = []
    for c, v in categs.iteritems():
        if v < thresh_categs_count:
            to_del.append(c)
    for c in to_del:
        categs.pop(c)
    print "len of categs after filtering rare ones:", len(categs)
    #delete rare categories from records:
    r_to_del = []
    for rec in records:
        to_del = []
        for c in rec[category_field_name]:
            if c not in categs:
                to_del.append(c)
        for c in to_del:
            rec[category_field_name].remove(c)
        if len(rec[category_field_name]) == 0:
            r_to_del.append(rec)
    #delete records that lost all their codes:
    print "len of records before filtering those without codes:", len(records)
    for rec in r_to_del:
        records.remove(rec)
    print "len of records after filtering those without codes:", len(records)
    return records


if __name__ == "__main__":
    import sys
    sys.path.append(r'../')
    from zbl2py import record_read, record_store
    records = filter_out_rare_codes_records(record_read.read_list_records(sys.argv[1]),
                                            int(sys.argv[3]))
    record_store.store_py_records(records, sys.argv[2])
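#A toy run of the filter above on hypothetical records (assumes it is executed
#in the same module as the function): '16S34' and '03E05' each occur only once,
#so with a threshold of 2 both are removed, and the record that loses all its
#codes is dropped entirely.
toy_records = [
    {'categories': ['20K01', '16S34']},
    {'categories': ['20K01']},
    {'categories': ['03E05']},
]
filtered = filter_out_rare_codes_records(toy_records, 2)
print [r['categories'] for r in filtered]
#expected output: [['20K01'], ['20K01']]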
#Turns zbl records into bag-of-words count vectors over a fixed word list
#(presumably the records_to_words_weights_converter module that the main
#scripts above import from the features package).
import sys

from wordsfreq import select_descriptive_words
from zbl2py import record_read
#assumption: text2words (providing text_to_words) sits in the same package
import text2words


def calc_word_feats(s, words):
    """Calculate the number of occurrences in s of each word from words."""
    occurences = {}
    for w in words:
        occurences[w] = 0
    for w in text2words.text_to_words(s):
        if w in occurences:
            occurences[w] += 1
    return [occurences[w] for w in words]


def convert_records_to_words(records, words):
    """Convert a list of records into (word-count vector, categories) pairs."""
    for rec in records:
        try:
            title = rec['ti']
            descr = rec['descr']
            kw = " ".join(rec['kw'])
            feats = calc_word_feats(" ".join([title, descr, kw]), words)
            yield (feats, rec['categories'])
        except:
            #skip records that are missing any of the required fields
            continue


if __name__ == '__main__':
    words = select_descriptive_words.select_descriptive_words_quotientmethod(
        sys.argv[1], sys.argv[2], int(sys.argv[3]), float(sys.argv[4]))
    #argv[5]: the records file (argv[4] is the threshold used above)
    for i in convert_records_to_words(record_read.read_list_records(sys.argv[5]), words):
        print i
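#A toy check of the counting logic in calc_word_feats. The real tokenizer,
#text2words.text_to_words, is not shown in this listing, so a simple
#lower-casing, letters-only tokenizer stands in for it (an assumption -- the
#real one may normalise text differently).
import re


def toy_text_to_words(s):
    return re.findall(r"[a-z]+", s.lower())


def toy_calc_word_feats(s, words):
    occurences = dict((w, 0) for w in words)
    for w in toy_text_to_words(s):
        if w in occurences:
            occurences[w] += 1
    return [occurences[w] for w in words]


words = ["group", "algebra", "finite"]
print toy_calc_word_feats("Finite group actions on the group algebra", words)
#prints [2, 1, 1]: counts follow the order of the word list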