import csv, json, os, random
from flask import request
from BM25.TextCleaning import wordslist2string, cleanStringAndLemmatize
# preparecwd, errorResponseVanilla, jsonVanilla, temp, and changeback are
# assumed to be defined elsewhere in this module

def pre_post_cleansing(rownumber, csvfilename):
    # show the effect of cleaning/lemmatization on one csv row
    with open(csvfilename, encoding="utf-8") as f:
        rows = list(csv.DictReader(f))
    before = rows[rownumber]["srvc_req_prob_text"]
    after = wordslist2string(cleanStringAndLemmatize(before))
    print(before, " becomes ", after)
    return before, after
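# Example usage (the csv filename is a placeholder; any export with a
# srvc_req_prob_text column works):
#   before, after = pre_post_cleansing(0, "service_requests.csv")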
def onecorpus(csvfilename, num):
    # build a single corpus: one .txt file per closer, one cleaned summary per line
    preparecwd(num)
    if not os.path.exists(str(num)):
        os.makedirs(str(num))
    with open(csvfilename, encoding="utf-8") as f:
        for row in csv.DictReader(f):
            name = row["sr_closer_name"]
            docname = name.replace(",", "_").replace(" ", "")
            filepath = os.path.join(str(num), docname + ".txt")
            with open(filepath, "a", encoding="utf-8") as file:
                # write the processed string, not the raw string from the csv
                newline = wordslist2string(cleanStringAndLemmatize(str(row["srvc_req_prob_text"])))
                if len(newline) > 2:
                    file.write(newline + " \n ")
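# Example usage (placeholder filename): writes one file per closer under a
# directory named after num, e.g. 1/Smith_John.txt:
#   onecorpus("service_requests.csv", 1)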
def twocorpora(csvfilename, num, percentintraining):
    # build separate train/test corpora, assigning each row at random
    preparecwd(num)
    for sub in ("TrainCorpus", "TestCorpus"):
        dirpath = os.path.join(str(num), sub)
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
    with open(csvfilename, encoding="utf-8") as f:
        for row in csv.DictReader(f):
            name = row["sr_closer_name"]
            docname = name.replace(",", "_").replace(" ", "")
            sub = "TrainCorpus" if random.random() < percentintraining else "TestCorpus"
            filepath = os.path.join(str(num), sub, docname + ".txt")
            with open(filepath, "a", encoding="utf-8") as file:
                # write the processed string, not the raw string from the csv
                newline = wordslist2string(cleanStringAndLemmatize(str(row["srvc_req_prob_text"])))
                if len(newline) > 2:
                    file.write(newline + " \n ")
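# Example usage: with percentintraining=0.75, each row lands in 1/TrainCorpus
# with probability ~0.75, otherwise in 1/TestCorpus (filename is a placeholder):
#   twocorpora("service_requests.csv", 1, 0.75)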
def match():
    # endpoint handler: clean the incoming problem description, run the
    # query algorithm, and return the predicted TSEs as json
    try:
        print(request)
        myjson = request.get_json(force=True)
        print("this is ", myjson)
    except Exception as inst:
        print(type(inst), inst.args, inst, "getTSEs: failed to parse request json")
        return errorResponseVanilla("getTSEs " + str(type(inst)), status=500)
    try:
        print(myjson["problem_description"])
        # clean the query text the same way the corpus documents were cleaned
        probdesc = wordslist2string(cleanStringAndLemmatize(myjson["problem_description"]))
        predictions = temp.queryalgorithm(probdesc)
        predictions = [changeback(prediction) for prediction in predictions]
    except Exception as inst:
        return errorResponseVanilla("getTSEs " + str(type(inst)), status=500)
    data = {
        "error": "",
        "TSEs": predictions
    }
    print(data)
    js = json.dumps(data)
    return jsonVanilla(js, status=200)
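# A minimal sketch of exposing match() as a route, assuming the surrounding
# module builds a Flask app (the app variable and the /getTSEs path are
# assumptions based on the "getTSEs" error strings above):
#   from flask import Flask
#   app = Flask(__name__)
#   app.add_url_rule("/getTSEs", "match", match, methods=["POST"])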
class CsvCorpus:
    # hypothetical wrapper: only __iter__ comes from the source; the class
    # name and __init__ are assumptions added to make the fragment runnable
    def __init__(self, reader):
        self.reader = reader
    def __iter__(self):
        # yield (document name, cleaned problem summary) pairs, one per csv row
        for row in self.reader:
            name = row["sr_closer_name"]
            docname = name.replace(",", "_").replace(" ", "")
            newline = wordslist2string(cleanStringAndLemmatize(str(row["srvc_req_prob_text"])))
            yield docname, newline
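# Example usage of the wrapper (the filename is a placeholder):
#   corpus = CsvCorpus(csv.DictReader(open("service_requests.csv", encoding="utf-8")))
#   for docname, cleaned in corpus:
#       print(docname, cleaned)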
# __author__ = 'basil.beirouti'

import csv, datetime, operator, random
from BM25 import last_thousand
from BM25.Scheduling import whos_on, read_filtered_csv_file, read_raw_schedule_csv, write_filtered_csv_file, this_year, docmatrix_data
from BM25.Plugins import tuples_tse_psums_concat
from BM25.TextCleaning import wordslist2string, cleanStringAndLemmatize
from BM25.BM25Okapi import QueryMaster, DocMatrix

def rand_divide(data, proportion):
    # shuffle in place, then split so that roughly `proportion` of the
    # items end up in the first group
    lendata = len(data)
    numgroup1 = round(proportion * lendata)
    numgroup2 = lendata - numgroup1
    random.shuffle(data)
    group1 = data[0:numgroup1]
    group2 = data[numgroup1:]
    assert numgroup1 == len(group1)
    assert numgroup2 == len(group2)
    return group1, group2
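# Usage note (not from the source): rand_divide shuffles its argument in
# place, so seed the RNG first if a reproducible split is needed:
#   random.seed(42)
#   train, test = rand_divide(list(cleaned_data), 0.75)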

# build a BM25 document matrix from recent problem summaries of the TSEs on shift
rows = read_filtered_csv_file()
on_now = whos_on(rows)
personids = [el[1] for el in on_now]
out = docmatrix_data(personids, 500)
srnums, badgenums, personids, fns, lns, psums, dates = zip(*out)
# pair each problem summary with its owner's name (first name + last name)
tupsdata = [(el[3] + el[4], el[5]) for el in out]
cleaned_data = [(el[0], wordslist2string(cleanStringAndLemmatize(el[1]))) for el in tupsdata]
train, test = rand_divide(cleaned_data, 0.75)
processed_data = tuples_tse_psums_concat(train)
docmatrix = DocMatrix(processed_data)
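# Hedged sketch of querying the matrix: match() above calls temp.queryalgorithm,
# which suggests a query object wraps the DocMatrix. The QueryMaster constructor
# signature here is an assumption, not confirmed by this file:
#   qm = QueryMaster(docmatrix)
#   qm.queryalgorithm(wordslist2string(cleanStringAndLemmatize("server will not boot")))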
def clean_docmatrix_data(raw_data):
    # (lastname_firstname, cleaned problem summary) tuples, sorted by name
    temp = [(el[3].replace(" ", "") + "_" + el[2].replace(" ", ""),
             wordslist2string(cleanStringAndLemmatize(el[4]))) for el in raw_data]
    temp.sort(key=operator.itemgetter(0))
    return temp
def csv_to_tups(csvfilename):
    # read the csv into sorted (closer name, cleaned problem summary) tuples;
    # changename is assumed to be defined elsewhere in this module
    with open(csvfilename, encoding="utf-8") as f:
        alldata = [(changename(row["sr_closer_name"]),
                    wordslist2string(cleanStringAndLemmatize(row["srvc_req_prob_text"])))
                   for row in csv.DictReader(f)]
    alldata = sorted(alldata, key=operator.itemgetter(0))
    return alldata
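# Example usage (placeholder filename for the service-request export):
#   tups = csv_to_tups("service_requests.csv")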