def wrapper(istring, annotator, index):
    """Run the pre-tagging pipeline for one item and publish its final tags.

    Intermediate files are created in the ``data_final`` directory below the
    current working directory and are named
    ``<annotator><istring>-<stage>.txt``.  The steps run in this order:

    1. ``readSQL.read_knowledge_unit`` receives the id and the ``-raw`` name.
    2. ``mytokenizer.tokenize`` receives the ``-raw`` and ``-tokenized`` names.
    3. ``transforminput.transforminput`` receives the ``-tokenized`` and
       ``-input2tagger`` names.
    4. ``pretag/pretag.bat`` is executed from inside ``pretag`` with the
       ``-input2tagger`` and ``-pretagged`` paths (relative to ``pretag``)
       as its two arguments.
    5. ``transformoutput.transformoutput`` receives the id and the
       ``-pretagged`` and ``-final`` names.
    6. The ``-final`` file is copied to ``<annotator><index>.automatic_tags``
       inside ``data_final``.

    Args:
        istring: Identifier of the item, as a string; it is concatenated
            into every file name and passed to the helpers as the id.
        annotator: String prefix placed in front of every file name.
        index: Value converted with ``str`` to name the published copy.

    The helpers and the batch file are addressed through relative paths, so
    the process working directory is changed while the pipeline runs.  It is
    always restored to its starting value, even when a step raises.
    """
    cur_dir = os.getcwd()
    # Build paths with os.path.join instead of backslash literals: strings
    # such as ".\data_final" only worked because "\d" is not a recognised
    # escape sequence.
    data_dir = os.path.join(cur_dir, 'data_final')
    pretag_dir = os.path.join(cur_dir, 'pretag')
    batch_path = os.path.join(pretag_dir, 'pretag.bat')  # this is the batch file
    print(batch_path)

    q_id = istring
    prefix = annotator + q_id
    q_raw_content = prefix + '-raw.txt'
    q_tokenize = prefix + '-tokenized.txt'
    q_transformin = prefix + '-input2tagger.txt'
    q_pretag = prefix + '-pretagged.txt'
    q_final = prefix + '-final.txt'

    try:
        os.chdir(data_dir)
        readSQL.read_knowledge_unit(q_id, q_raw_content)
        mytokenizer.tokenize(q_raw_content, q_tokenize)
        transforminput.transforminput(q_tokenize, q_transformin)

        # The batch file is started from its own directory and is handed
        # paths relative to it, exactly as before.
        os.chdir(pretag_dir)
        return_code = subprocess.call([
            batch_path,
            os.path.join(os.pardir, 'data_final', q_transformin),
            os.path.join(os.pardir, 'data_final', q_pretag),
        ])  # call() blocks until the batch file has finished
        print("parent process")
        if return_code != 0:
            # Report the failure but keep going, as the pipeline always did.
            print("warning: %s exited with code %s" % (batch_path, return_code))

        os.chdir(data_dir)
        transformoutput.transformoutput(q_id, q_pretag, q_final)
        shutil.copy2(q_final, annotator + str(index) + '.automatic_tags')
    finally:
        # Never leave the process stranded in a sub-directory on failure.
        os.chdir(cur_dir)
    print("program finish")
cur.execute("SELECT Id FROM posts where ParentId=%s" %(question_id)) answers = cur.fetchall() for row in answers: answer_id = row[0] cur.execute("SELECT Body FROM posts where Id=%s" %(answer_id)) ans_body = cur.fetchall() body += ans_body cur.execute("SELECT Text FROM comments where PostId=%s" %(answer_id)) ans_comments = cur.fetchall() comments += ans_comments uni = ''.join( my_encoder( title[0][0] ) ) f.write(uni + '\n') for row in body: #uni = unicode(strip_tags(row[0]), 'utf-8', errors='replace_against_space') uni = ''.join( my_encoder( strip_tags(''.join(my_encoder(row[0]) ) ) ) ) f.write(uni + '\n') for row in comments: uni = ''.join( my_encoder( strip_backtick(row[0]) ) ) f.write(uni + '\n') f.close() if __name__=='__main__': # html = '''asdfasdf''' # print strip_tags(''.join(my_encoder(html))) dump('raw.txt') # dump to input_brown.txt mytokenizer.tokenize('raw.txt', 'tokenized.txt')