def reformat_iob(input_fname, output_fname, lang_code):
    """
    Re-tokenise an existing IOB file and transfer the IOB tags.

    Utility function. Reads an IOB file, re-tokenises every sentence with a
    punctuation-based tokenisation (via `tokenise_and_tag`) instead of the
    original whitespace one, and transfers the IOB tags onto the newly
    created tokens.

    TODO:
    * this should go into the Utils module
    * add support for abbreviation file for treetagger, to pass with -a param from cli
    * actually write `result` to `output_fname` — the path is accepted but
      currently unused; the re-tagged sentences are only returned

    Args:
        input_fname: a string, being the path to the input file (UTF-8).
        output_fname: a string, being the path to the output file
            (currently unused, see TODO).
        lang_code: the language of the file content, important for
            tokenisation and POS.

    Returns:
        A list of sentences, each sentence being a list of
        (token, pos_tag, iob_label) tuples.
    """
    from citation_extractor.Utils import IO
    import re
    import codecs

    infile = codecs.open(input_fname, "r", "utf-8")
    try:
        data = infile.read()
    finally:
        infile.close()

    sentences = IO.read_instances(data)
    # Rebuild each sentence as a plain whitespace-joined string so that it
    # can be re-tokenised from scratch.
    plain_sentences = [" ".join([t[0] for t in s]) for s in sentences]

    result = []
    for sent_idx, sent in enumerate(sentences):
        new_sent = []
        wt_sent = tokenise_and_tag(plain_sentences[sent_idx], lang_code)
        read = 0  # pointer synchronising the reading of the two token streams
        prev_tok = ""  # accumulates the pieces of a split-up original token
        for tok in wt_sent:
            # Normalise the token to unicode; the tagger output may contain
            # byte strings encoded as either UTF-8 or Latin-1.
            if type(tok[0]) != type(u"x"):
                try:
                    token = tok[0].decode('utf-8')
                except Exception:
                    token = tok[0].decode('latin-1')
            else:
                token = tok[0]
            # The POS tag may be in field 1 or, when that is empty, field 2.
            if tok[1] == '':
                pos_tag = tok[2]
            else:
                pos_tag = tok[1]
            if token == sent[read][0]:
                # The two tokens are identical: copy the IOB tag over.
                new_sent.append((tok[0], pos_tag, sent[read][1]))
                read += 1
                prev_tok = ""
            elif "%s%s" % (prev_tok, token) == sent[read][0]:
                # Accumulated previous pieces + current piece complete the
                # original token: emit with a continuation label (B- -> I-).
                label = sent[read][1]
                if re.match(r"B-", sent[read][1]) is not None:
                    label = re.sub(r"B-", "I-", sent[read][1])
                new_sent.append((tok[0], pos_tag, label))
                read += 1
                prev_tok = ""
            elif token in sent[read][0]:
                # Current piece is only a fragment of the original token.
                if re.match("^%s.*" % re.escape(tok[0]), sent[read][0]):
                    # Leading fragment keeps the original label (B- stays B-).
                    new_sent.append((tok[0], pos_tag, sent[read][1]))
                else:
                    # Inner fragment continues the entity: B- becomes I-.
                    label = sent[read][1]
                    if re.match(r"B-", sent[read][1]) is not None:
                        label = re.sub(r"B-", "I-", sent[read][1])
                    new_sent.append((tok[0], pos_tag, label))
                # Remember what has been consumed of the original token so the
                # next piece can be matched against "prev + current" above.
                # (Bug fix: the original code never updated prev_tok, which
                # made the combined-match branch unreachable.)
                prev_tok = "%s%s" % (prev_tok, token)
            else:
                # Streams are out of sync: skip one original token and tag the
                # current token with the following original label.
                # NOTE(review): advancing `read` before indexing can raise
                # IndexError at sentence end — behaviour kept as-is.
                read += 1
                new_sent.append((tok[0], pos_tag, sent[read][1]))
        result.append(new_sent)
    return result