occurences = {}
    for w in words:
        occurences[w] = 0

    for w in text2words.text_to_words(s):
        if w in occurences:
            occurences[w] += 1

    return [occurences[w] for w in words]

def convert_records_to_words(records, words):
    """Lazily turn records into (word counts, categories) pairs.

    For every record, the 'ti' value, the 'descr' value and the
    space-joined 'kw' entries are merged into a single space-separated
    string.  That string is handed to calc_word_feats together with
    `words`, and the resulting list is yielded alongside the record's
    'categories' value.
    """
    for record in records:
        # Keys are read in the order ti, descr, kw.
        pieces = [record['ti'], record['descr']]
        pieces.append(" ".join(record['kw']))
        text = " ".join(pieces)

        yield (calc_word_feats(text, words), record['categories'])

if __name__ == '__main__':
    # Positional arguments:
    #   1: extr_fromfname, 2: basefname  (both passed to the word selector)
    #   3: words_count (int), 4: thresh_div (float)
    #   5: records file to convert
    words = select_descriptive_words.select_descriptive_words_quotientmethod(
        sys.argv[1], sys.argv[2], int(sys.argv[3]), float(sys.argv[4]))
    # The records file is its own (fifth) argument.  Previously argv[4] was
    # reused here, i.e. the float threshold string was opened as a file name.
    for i in convert_records_to_words(
            record_read.read_list_records(sys.argv[5]), words):
        # Single-argument form prints the same under Python 2 and Python 3.
        print(i)
from classifier_tree import TreeSingleTagWordsClassifier
from mainleave1out_biggestcategory_svm import extract_most_common_categ

if __name__ == '__main__':
    #read words that are most important:
    extr_fromfname = sys.argv[1]
    basefname = sys.argv[2]
    words_count = int(sys.argv[3])
    thresh_div = float(sys.argv[4])
    records_file = sys.argv[5]
    test_samples = int(sys.argv[6])
    print "Arguments read:"
    print "extr_fromfname =", extr_fromfname
    print "basefname =", basefname
    print "words_count =", words_count
    print "thresh_div =", thresh_div
    print "records_file =", records_file
    print "test_samples =", test_samples
    words = select_descriptive_words.select_descriptive_words_quotientmethod(extr_fromfname, basefname, words_count, thresh_div)
    #read records and convert them into feature-vectors:
    frecords = list(records_to_words_weights_converter.convert_records_to_words(record_read.read_list_records(records_file), words))
    #create frecors with numerical etiquettes:
    #build multi-label-SVM based on this data:
    most_common_categ, max_cnt = extract_most_common_categ(frecords)
    print "Most common category is:", most_common_categ, " with ", max_cnt, " occurences."
    loo = LeaveOneOut(lambda samples: TreeSingleTagWordsClassifier(most_common_categ, samples, featurenames=words), frecords, lambda x: [int(most_common_categ in x[1])])
    corr = loo.test(test_samples)
    print "Correctness:", corr
from wordsfreq import select_descriptive_words
from features import records_to_words_weights_converter
from zbl2py import record_read
from classifier_tester import LeaveOneOutAllCategories
from classifier_svm import SvmWordsClassifier

if __name__ == '__main__':
    #read words that are most important:
    extr_fromfname = sys.argv[1]
    basefname = sys.argv[2]
    words_count = int(sys.argv[3])
    thresh_div = float(sys.argv[4])
    records_file = sys.argv[5]
    test_samples = int(sys.argv[6])
    print "Arguments read:"
    print "extr_fromfname =", extr_fromfname
    print "basefname =", basefname
    print "words_count =", words_count
    print "thresh_div =", thresh_div
    print "records_file =", records_file
    print "test_samples =", test_samples
    words = select_descriptive_words.select_descriptive_words_quotientmethod(extr_fromfname, basefname, words_count, thresh_div)
    #read records and convert them into feature-vectors:
    frecords = list(records_to_words_weights_converter.convert_records_to_words(record_read.read_list_records(records_file), words))
    #create frecors with numerical etiquettes:
    #build multi-label-SVM based on this data:
    loo = LeaveOneOutAllCategories(SvmWordsClassifier, frecords)
    corr = loo.test(test_samples)
    print "Correctness:", corr
def calc_word_feats(s, words):
    """Count how often each entry of `words` occurs among the tokens of `s`.

    `s` is tokenized with text2words.text_to_words; tokens that are not in
    `words` are ignored.  Returns a list of counts, one per entry of
    `words`, in the same order as `words`.
    """
    # Every requested word starts at zero so absent words still get a slot.
    counts = dict.fromkeys(words, 0)
    for token in text2words.text_to_words(s):
        if token not in counts:
            continue
        counts[token] += 1
    return [counts[word] for word in words]

def convert_records_to_words(records, words):
    """Generate a (word counts, categories) pair for each record.

    The text of a record is its 'ti' value, its 'descr' value and its 'kw'
    entries (joined with spaces), all joined with single spaces.  The counts
    come from calc_word_feats(text, words); the second element of each pair
    is the record's 'categories' value.
    """
    for record in records:
        # Keys are read in the order ti, descr, kw.
        heading, body = record['ti'], record['descr']
        keywords = " ".join(record['kw'])
        text = " ".join([heading, body, keywords])
        yield (calc_word_feats(text, words), record['categories'])

if __name__ == '__main__':
    # Positional arguments:
    #   1: extr_fromfname, 2: basefname  (both passed to the word selector)
    #   3: words_count (int), 4: thresh_div (float)
    #   5: records file to convert
    words = select_descriptive_words.select_descriptive_words_quotientmethod(
        sys.argv[1], sys.argv[2], int(sys.argv[3]), float(sys.argv[4]))
    # The records file is its own (fifth) argument.  Previously argv[4] was
    # reused here, i.e. the float threshold string was opened as a file name.
    for i in convert_records_to_words(
            record_read.read_list_records(sys.argv[5]), words):
        # Single-argument form prints the same under Python 2 and Python 3.
        print(i)