def tempMicrotaskParser():
    """Create one StaticMicrotask per paragraph of the first unassigned subtask.

    Takes the first ``Subtask`` whose ``assigned`` is 0, splits its
    ``original_data`` on CRLF blank lines and its ``translated_data`` on LF
    blank lines, and stores one ``StaticMicrotask`` per original paragraph.
    Every microtask is linked to the ``Master_Experiment`` row with pk 4.
    The subtask is then saved with ``assigned = 1``.

    Returns:
        ``""`` when a subtask was processed, or
        ``"No subtask available for parsing"`` when none is unassigned.
    """
    results = Subtask.objects.filter(assigned=0)
    # Debug output kept from the original; the parenthesised form prints the
    # same text under the Python 2 print statement and the Python 3 function.
    print(results)
    if not results:
        return "No subtask available for parsing"

    subtask = results[0]
    sub = Subtask.objects.get(id=subtask.id)
    task = Task.objects.get(id=sub.task_id)
    sub.assigned = 1
    experiment = Master_Experiment.objects.get(pk=4)

    # NOTE(review): str() on non-ASCII unicode text raises under Python 2
    # unless the default encoding was changed elsewhere -- confirm.
    originals = str(sub.original_data).split("\r\n\r\n")
    translations = str(sub.translated_data).split("\n\n")

    for index, paragraph in enumerate(originals):
        micro = StaticMicrotask()
        micro.subtask = subtask
        micro.task = task
        micro.original_sentence = paragraph
        micro.bit_array = experiment
        micro.meaning = dictionary(paragraph)
        # Fewer translated segments than original ones used to raise
        # IndexError here, aborting before sub.save() and leaving the subtask
        # unassigned with half of its microtasks already stored.
        if index < len(translations):
            micro.machine_translation = translations[index]
        else:
            micro.machine_translation = ''
        # First save assigns the primary key needed by the query below.
        micro.save()
        # `fields` must be a sequence of names: ('meaning') is just a string.
        micro.meaning = serializers.serialize(
            'json',
            StaticMicrotask.objects.filter(pk=micro.id),
            fields=('meaning',),
            ensure_ascii=False)
        micro.save()

    sub.save()
    return ""
def tempMicrotaskParser():
    """Create one StaticMicrotask per paragraph of the first unassigned subtask.

    Takes the first ``Subtask`` whose ``assigned`` is 0, splits its
    ``original_data`` on CRLF blank lines and its ``translated_data`` on LF
    blank lines, and stores one ``StaticMicrotask`` per original paragraph.
    Every microtask is linked to the ``Master_Experiment`` row with pk 4.
    The subtask is then saved with ``assigned = 1``.

    Returns:
        ``""`` when a subtask was processed, or
        ``"No subtask available for parsing"`` when none is unassigned.
    """
    results = Subtask.objects.filter(assigned=0)
    # Debug output kept from the original; the parenthesised form prints the
    # same text under the Python 2 print statement and the Python 3 function.
    print(results)
    if not results:
        return "No subtask available for parsing"

    subtask = results[0]
    sub = Subtask.objects.get(id=subtask.id)
    task = Task.objects.get(id=sub.task_id)
    sub.assigned = 1
    experiment = Master_Experiment.objects.get(pk=4)

    # NOTE(review): str() on non-ASCII unicode text raises under Python 2
    # unless the default encoding was changed elsewhere -- confirm.
    originals = str(sub.original_data).split("\r\n\r\n")
    translations = str(sub.translated_data).split("\n\n")

    for index, paragraph in enumerate(originals):
        micro = StaticMicrotask()
        micro.subtask = subtask
        micro.task = task
        micro.original_sentence = paragraph
        micro.bit_array = experiment
        micro.meaning = dictionary(paragraph)
        # Fewer translated segments than original ones used to raise
        # IndexError here, aborting before sub.save() and leaving the subtask
        # unassigned with half of its microtasks already stored.
        if index < len(translations):
            micro.machine_translation = translations[index]
        else:
            micro.machine_translation = ''
        # First save assigns the primary key needed by the query below.
        micro.save()
        # `fields` must be a sequence of names: ('meaning') is just a string.
        micro.meaning = serializers.serialize(
            'json',
            StaticMicrotask.objects.filter(pk=micro.id),
            fields=('meaning',),
            ensure_ascii=False)
        micro.save()

    sub.save()
    return ""
def microtaskParser():
    """Split the first unassigned subtask into sentence-level StaticMicrotasks.

    Takes the first ``Subtask`` whose ``assigned`` is 0, strips bracketed
    editorial markers (``[see]``, ``[citation needed]``, ``[cite ...]``,
    ``[deadlink]``, ``[disambi...]``, ``[note...]``, ``[<digits>]``) from its
    ``original_data``, marks sentence boundaries with the token ``SEN_END``
    and stores a ``StaticMicrotask`` for every resulting sentence of more
    than four space-separated words.  Shorter fragments are skipped (their
    handling was never implemented).  The subtask is then saved with
    ``assigned = 1``.

    Returns:
        ``""`` when a subtask was processed, or
        ``"No subtask available for parsing"`` when none is unassigned.
    """
    results = Subtask.objects.filter(assigned=0)
    if not results:
        return "No subtask available for parsing"

    subtask = results[0]
    sub = Subtask.objects.get(id=subtask.id)
    task = Task.objects.get(id=sub.task_id)
    sub.assigned = 1

    data = str(sub.original_data)
    data = re.sub(r'[\t\n]+', '\n', data)
    # Bracketed markers to drop.  The cite/disambi/note patterns stop at the
    # first closing bracket: the former greedy `.*` swallowed real text up to
    # the last `]` on the line, and `i*` / `e*` only repeated one letter so
    # markers such as "[note 1]" were never matched.
    marker_patterns = (
        r'\[see\]',
        r'\[citation needed\]',
        r'\[cite[^\]]*\]',
        r'\[deadlink\]',
        r'\[disambi[^\]]*\]',
        r'\[note[^\]]*\]',
        r'\[[0-9]+\]',
    )
    for pattern in marker_patterns:
        data = re.sub(pattern, '', data)

    # Prefix every ". "-separated piece with a sentence marker; joining once
    # avoids rebuilding the whole string for each sentence.
    string = ''.join(
        '. SEN_END ' + sentence.lstrip()
        for line in data.split("\n")
        for sentence in line.split(". "))

    string = re.sub(r'([\?\.\!]+\s*)(\[\d+\]\s*)(\w+)',
                    r'\1 SEN_END \2\3', string)
    string = re.sub(r'(\."[\t\n\s]+)([A-Z]+)', r'\1SEN_END \2', string)
    # Undo the split after every match of the module-level RE pattern
    # (presumably abbreviations -- RE is defined outside this function).
    for abr in RE.findall(string):
        string = re.sub(r'\b[ \s\W]+' + abr + r'. SEN_END\b',
                        " " + abr + ".", string)
    # A marker followed by a lower-case word is not a sentence start.  The
    # replacement used to be r'\s\2', which inserts a literal backslash-s on
    # Python 2 and is rejected as a bad escape by newer re versions.
    string = re.sub(r'(SEN_END)\s*([a-z]+)', r' \2', string)
    string = re.sub(r'([\s\W]+)([A-Z0-9][\.\?\!]\s+)(SEN_END)',
                    r'\1\2', string)
    string = re.sub(r'\.+', '.', string)
    string = re.sub(r'[\t\s]+', ' ', string)

    # At most ten experiments are used in rotation.  Fetching them once
    # replaces one query per sentence, and wrapping at the real row count
    # avoids the IndexError the fixed wrap-at-10 raised with fewer rows.
    max_experiments = 10
    experiments = list(Master_Experiment.objects.all()[:max_experiments])
    translations = str(sub.translated_data).split("\n\n")

    exp_index = 0
    trans_index = 0
    for sentence in string.split("SEN_END"):
        if sentence in ('. ', '', ' '):
            continue
        if len(sentence.split(' ')) <= 4:
            # TODO: translate and store short fragments (not implemented).
            continue

        if exp_index >= len(experiments):
            exp_index = 0
        micro = StaticMicrotask()
        micro.subtask = subtask
        micro.task = task
        micro.original_sentence = sentence
        # Still raises IndexError when there are no Master_Experiment rows.
        micro.bit_array = experiments[exp_index]
        micro.meaning = dictionary(sentence)
        # NOTE(review): translations are consumed one per stored sentence;
        # confirm translated_data is segmented the same way.  A missing
        # segment now yields '' instead of an IndexError mid-way.
        if trans_index < len(translations):
            micro.machine_translation = translations[trans_index]
        else:
            micro.machine_translation = ''
        # First save assigns the primary key needed by the query below.
        micro.save()
        # `fields` must be a sequence of names: ('meaning') is just a string.
        micro.meaning = serializers.serialize(
            'json',
            StaticMicrotask.objects.filter(pk=micro.id),
            fields=('meaning',),
            ensure_ascii=False)
        micro.save()
        exp_index += 1
        trans_index += 1

    sub.save()
    return ""
def microtaskParser():
    """Split the first unassigned subtask into sentence-level StaticMicrotasks.

    Takes the first ``Subtask`` whose ``assigned`` is 0, strips bracketed
    editorial markers from its ``original_data``, marks sentence boundaries
    with the token ``SEN_END`` and stores a ``StaticMicrotask`` for every
    resulting sentence of more than four space-separated words.  Shorter
    fragments are skipped (their handling was never implemented).  The
    subtask is then saved with ``assigned = 1``.

    Returns:
        None.  Nothing is done when no unassigned subtask exists.
    """
    results = Subtask.objects.filter(assigned=0)
    if not results:
        # Indexing the empty result set used to raise IndexError.
        return

    subtask = results[0]
    sub = Subtask.objects.get(id=subtask.id)
    task = Task.objects.get(id=sub.task_id)
    sub.assigned = 1

    data = str(sub.original_data)
    data = re.sub(r'[\t\n]+', '\n', data)
    # Bracketed markers to drop.  The cite/disambi/note patterns stop at the
    # first closing bracket: the former greedy `.*` swallowed real text up to
    # the last `]` on the line, and `i*` / `e*` only repeated one letter so
    # markers such as "[note 1]" were never matched.
    marker_patterns = (
        r'\[see\]',
        r'\[citation needed\]',
        r'\[cite[^\]]*\]',
        r'\[deadlink\]',
        r'\[disambi[^\]]*\]',
        r'\[note[^\]]*\]',
        r'([\s\W]*)(\[[0-9]+\])',
    )
    for pattern in marker_patterns:
        data = re.sub(pattern, '', data)

    pieces = []
    for line in data.split("\n"):
        sentences = line.split(". ")
        if len(sentences) > 1:
            # Re-join a piece that ends with a match of the module-level RE
            # pattern (presumably abbreviations) with the piece after it.
            # Walking by index replaces the old pop/insert on the list being
            # iterated, which skipped the direct successor and reordered the
            # remaining sentences.
            position = 0
            while position < len(sentences):
                sentence = sentences[position]
                if any(sentence.endswith(abbr)
                       for abbr in RE.findall(sentence)):
                    if position + 1 < len(sentences):
                        following = sentences[position + 1]
                    else:
                        following = ''
                    sentence = sentence + '. ' + following
                    position += 1  # the successor has been consumed
                pieces.append('. SEN_END ' + sentence.lstrip())
                position += 1
        else:
            pieces.append('. SEN_END ' + line.lstrip())
    string = ''.join(pieces)

    string = re.sub(r'([\?\.\!]+\s*)(\[\d+\]\s*)(\w+)',
                    r'\1 SEN_END\2\3', string)
    # A marker followed by a lower-case word is not a sentence start.
    string = re.sub(r'(SEN_END)\s*([a-z]+)', r'\2', string)
    string = re.sub(r'([\s\W]+)([A-Z0-9][\.\?\!]\s+)(SEN_END)',
                    r'\1\2', string)

    # At most ten experiments are used in rotation.  Fetching them once
    # replaces one query per sentence, and wrapping at the real row count
    # avoids the IndexError the fixed wrap-at-10 raised with fewer rows.
    max_experiments = 10
    experiments = list(Master_Experiment.objects.all()[:max_experiments])

    exp_index = 0
    for sentence in string.split("SEN_END"):
        if sentence in ('. ', '', ' '):
            continue
        if len(sentence.split(' ')) <= 4:
            # TODO: translate and store short fragments (not implemented).
            continue

        if exp_index >= len(experiments):
            exp_index = 0
        micro = StaticMicrotask()
        micro.subtask = subtask
        micro.task = task
        micro.original_sentence = sentence
        # Still raises IndexError when there are no Master_Experiment rows.
        micro.bit_array = experiments[exp_index]
        micro.save()
        exp_index += 1

    sub.save()
def microtaskParser():
    """Split the first unassigned subtask into sentence-level StaticMicrotasks.

    Takes the first ``Subtask`` whose ``assigned`` is 0, strips bracketed
    editorial markers (``[see]``, ``[citation needed]``, ``[cite ...]``,
    ``[deadlink]``, ``[disambi...]``, ``[note...]``, ``[<digits>]``) from its
    ``original_data``, marks sentence boundaries with the token ``SEN_END``
    and stores a ``StaticMicrotask`` for every resulting sentence of more
    than four space-separated words.  Shorter fragments are skipped (their
    handling was never implemented).  The subtask is then saved with
    ``assigned = 1``.

    Returns:
        ``""`` when a subtask was processed, or
        ``"No subtask available for parsing"`` when none is unassigned.
    """
    results = Subtask.objects.filter(assigned=0)
    if not results:
        return "No subtask available for parsing"

    subtask = results[0]
    sub = Subtask.objects.get(id=subtask.id)
    task = Task.objects.get(id=sub.task_id)
    sub.assigned = 1

    data = str(sub.original_data)
    data = re.sub(r'[\t\n]+', '\n', data)
    # Bracketed markers to drop.  The cite/disambi/note patterns stop at the
    # first closing bracket: the former greedy `.*` swallowed real text up to
    # the last `]` on the line, and `i*` / `e*` only repeated one letter so
    # markers such as "[note 1]" were never matched.
    marker_patterns = (
        r'\[see\]',
        r'\[citation needed\]',
        r'\[cite[^\]]*\]',
        r'\[deadlink\]',
        r'\[disambi[^\]]*\]',
        r'\[note[^\]]*\]',
        r'\[[0-9]+\]',
    )
    for pattern in marker_patterns:
        data = re.sub(pattern, '', data)

    # Prefix every ". "-separated piece with a sentence marker; joining once
    # avoids rebuilding the whole string for each sentence.
    string = ''.join(
        '. SEN_END ' + sentence.lstrip()
        for line in data.split("\n")
        for sentence in line.split(". "))

    string = re.sub(r'([\?\.\!]+\s*)(\[\d+\]\s*)(\w+)',
                    r'\1 SEN_END \2\3', string)
    string = re.sub(r'(\."[\t\n\s]+)([A-Z]+)', r'\1SEN_END \2', string)
    # Undo the split after every match of the module-level RE pattern
    # (presumably abbreviations -- RE is defined outside this function).
    for abr in RE.findall(string):
        string = re.sub(r'\b[ \s\W]+' + abr + r'. SEN_END\b',
                        " " + abr + ".", string)
    # A marker followed by a lower-case word is not a sentence start.  The
    # replacement used to be r'\s\2', which inserts a literal backslash-s on
    # Python 2 and is rejected as a bad escape by newer re versions.
    string = re.sub(r'(SEN_END)\s*([a-z]+)', r' \2', string)
    string = re.sub(r'([\s\W]+)([A-Z0-9][\.\?\!]\s+)(SEN_END)',
                    r'\1\2', string)
    string = re.sub(r'\.+', '.', string)
    string = re.sub(r'[\t\s]+', ' ', string)

    # At most ten experiments are used in rotation.  Fetching them once
    # replaces one query per sentence, and wrapping at the real row count
    # avoids the IndexError the fixed wrap-at-10 raised with fewer rows.
    max_experiments = 10
    experiments = list(Master_Experiment.objects.all()[:max_experiments])
    translations = str(sub.translated_data).split("\n\n")

    exp_index = 0
    trans_index = 0
    for sentence in string.split("SEN_END"):
        if sentence in ('. ', '', ' '):
            continue
        if len(sentence.split(' ')) <= 4:
            # TODO: translate and store short fragments (not implemented).
            continue

        if exp_index >= len(experiments):
            exp_index = 0
        micro = StaticMicrotask()
        micro.subtask = subtask
        micro.task = task
        micro.original_sentence = sentence
        # Still raises IndexError when there are no Master_Experiment rows.
        micro.bit_array = experiments[exp_index]
        micro.meaning = dictionary(sentence)
        # NOTE(review): translations are consumed one per stored sentence;
        # confirm translated_data is segmented the same way.  A missing
        # segment now yields '' instead of an IndexError mid-way.
        if trans_index < len(translations):
            micro.machine_translation = translations[trans_index]
        else:
            micro.machine_translation = ''
        # First save assigns the primary key needed by the query below.
        micro.save()
        # `fields` must be a sequence of names: ('meaning') is just a string.
        micro.meaning = serializers.serialize(
            'json',
            StaticMicrotask.objects.filter(pk=micro.id),
            fields=('meaning',),
            ensure_ascii=False)
        micro.save()
        exp_index += 1
        trans_index += 1

    sub.save()
    return ""