return sourceLink

    def readAuthor(self, characters):
        """Consume a leading "{\\it ...}" author token from *characters*.

        The character list is mutated in place: everything up to and
        including the closing '}' (or the whole list when no '}' occurs)
        is deleted.  Returns a zbMathTokenizer.Author built from the text
        between the "{\\it " prefix and the closing brace.

        Raises:
            ValueError: if the list does not start with "{\\it ".
        """
        if "".join(characters[0:5]) != "{\\it ":
            raise ValueError("an author link must start with \"{\\it \"")

        # Scan forward for the closing brace.  Index 4 holds the prefix's
        # trailing space, so starting there can never match '}'.
        end = 4
        size = len(characters)
        while end < size and characters[end] != '}':
            end += 1

        token = zbMathTokenizer.Author("".join(characters[5:end]))
        del characters[:end + 1]
        return token
    
    def readBracedText(self, characters):
        """Consume a leading parenthesized "(...)" token from *characters*.

        The character list is mutated in place: everything up to and
        including the closing ')' (or the whole list when no ')' occurs)
        is deleted.

        Returns:
            zbMathTokenizer.BracedText wrapping the text between the
            parentheses.

        Raises:
            ValueError: if the first character is not '('.
        """
        if characters[0] != '(':
            # Message typos fixed ("a opening" / "funciton" in the original).
            raise ValueError("first char must be an opening brace if readBracedText function is called")

        # Scan forward for the closing parenthesis; stop at end of input
        # when none is present (the whole remainder is then consumed).
        i = 1
        while i < len(characters) and characters[i] != ')':
            i += 1

        bracedText = zbMathTokenizer.BracedText("".join(characters[1:i]))
        del characters[:i+1]
        return bracedText

# Class-level shared resources for zbMathTokenizer, initialized once at import.
# NOTE(review): machine-specific absolute path — breaks on any other machine;
# should come from configuration or a path relative to the project root.
zbMathTokenizer.stopList = readFileLinewise2Array("/home/simon/Projekte/MIRS/ClassificationFramework/util/stoplist.txt")
# Shared lemmatizer instance — presumably NLTK's WordNetLemmatizer (TODO confirm
# the import; it is outside this chunk).
zbMathTokenizer.wnl = WordNetLemmatizer()

"""t = zbMathTokenizer()
print " ".join(map(lambda x : x.strMapping(), t.tokenize("this is a test (TIAT) $asdfasdf$ {\\it Grubel, Bert} TIAT is cool [zbl asdfasdf 1000.1000]")))"""
for line in f:
    x = line.split(";")
    doc2msc[str(x[0])] = x[1].strip()
f.close()

target_class = "81"
ordered_document_assignments = map(
    lambda doc_id: doc2msc[str(doc_id)]
    if doc_id in doc2msc else None, document_ids)
ordered_document_labels = map(
    lambda lab: None
    if lab is None else (1 if lab[:len(target_class)] == target_class else 0),
    ordered_document_assignments)

test_doc_ind = indexes_in_list(document_ids,
                               readFileLinewise2Array("raw_data/test_doc_ids"))
train_doc_ind = indexes_in_list(
    document_ids, readFileLinewise2Array("raw_data/train_doc_ids"))

mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz")
train_mat = mat[train_doc_ind, :]
train_labels = itemgetter(*train_doc_ind)(ordered_document_labels)

svd = TruncatedSVD(n_components=1000)
svd.fit(train_mat)

test_mat = mat[test_doc_ind, :]
test_labels = itemgetter(*test_doc_ind)(ordered_document_labels)

clf = svm.LinearSVC()
clf.fit(svd.transform(train_mat), train_labels)
f.close()"""

# cut test/train set out of corpus
# Document ids indexing the rows of the term-document matrix, stored as JSON.
# `with` replaces the original json.load(open(...)) / open-close pair so the
# file handles are released even on error.
with open("derived_data/theorem_tdm_grouped_by_docs_doc_ids") as doc_id_file:
    document_ids = json.load(doc_id_file)

# Map document id -> MSC code, parsed from semicolon-separated "id;msc" lines.
doc2msc = {}
with open("raw_data/doc2msc") as f:
    for line in f:
        x = line.split(";")
        doc2msc[str(x[0])] = x[1].strip()

target_class = "81"
ordered_document_assignments = map(lambda doc_id: doc2msc[str(doc_id)] if doc_id in doc2msc else None, document_ids)
ordered_document_labels = map(lambda lab: None if lab is None else (1 if lab[:len(target_class)] == target_class else 0), ordered_document_assignments)

# Map the externally stored test/train document-id lists onto row indexes of
# the corpus matrix.  indexes_in_list is a project helper — presumably returns
# the position in document_ids of each id read from the file; TODO confirm its
# behavior for ids that are missing from document_ids.
test_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/test_doc_ids"))
train_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/train_doc_ids"))

# tf-idf term-document matrix; rows are assumed to be in document_ids order —
# TODO confirm against the script that wrote the .npz.
mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz")
train_mat = mat[train_doc_ind, :]
# NOTE(review): itemgetter(*ind) returns a single item instead of a tuple when
# ind has exactly one element, and it requires ordered_document_labels to be
# indexable (fails on a Python 3 map iterator).
train_labels = itemgetter(*train_doc_ind)(ordered_document_labels)

# Dimensionality reduction (LSA): project the sparse tf-idf features onto
# 1000 latent components, fitted on the training rows only.
# NOTE(review): TruncatedSVD requires n_components < n_features — assumes the
# matrix has more than 1000 feature columns; TODO confirm.
svd = TruncatedSVD(n_components=1000)
svd.fit(train_mat)

# Held-out rows and labels.  They are not evaluated anywhere in this chunk —
# presumably consumed further on (or this script was left unfinished).
test_mat = mat[test_doc_ind, :]
test_labels = itemgetter(*test_doc_ind)(ordered_document_labels)

# Linear SVM trained on the SVD-reduced training matrix.
clf = svm.LinearSVC()
clf.fit(svd.transform(train_mat), train_labels)