def load_lines(mylineformat_file):
    """Read a tab-separated file and collect one formatted sentence per line.

    Every input line must split into exactly five tab-separated fields. The
    second field (index 1, stripped) is used as the entity id and the fifth
    field (index 4) as the text. The text is split with ``getsentences``;
    the first sub-sentence for which ``has_ent(sent, ent_mid)`` is true is
    converted with ``convert_text2figer_format`` and recorded. A line with
    no such sub-sentence contributes nothing to the result.

    NOTE(review): a later definition of ``load_lines`` in this file rebinds
    the name -- confirm which copy is meant to be kept.

    Args:
        mylineformat_file: path of the file to read.

    Returns:
        A tuple ``(sents, all_ent_ins, raw_sents)`` of three lists that grow
        together, one entry per recorded sentence: the formatted sentence
        followed by a newline, the entity records returned by
        ``convert_text2figer_format``, and ``getrawsent(sent)``.

    Raises:
        AssertionError: if a line does not have exactly five fields (the
            offending line is printed first).
    """
    sents = []
    all_ent_ins = []
    raw_sents = []
    sent_counter = 0
    # The context manager closes the file even when a malformed line aborts
    # the loop; the previous bare open() never closed it.
    with open(mylineformat_file) as f:
        for line in f:
            parts = line.split('\t')
            if len(parts) != 5:
                # Echo the offending line before failing. With a single
                # argument this form prints the same under Python 2 and 3.
                print(line)
            assert len(parts) == 5, (
                'expected 5 tab-separated fields, got %d' % len(parts))
            ent_mid = parts[1].strip()
            text = parts[4]
            subsents = getsentences(text)
            for sent in subsents:
                if not has_ent(sent, ent_mid):
                    continue
                # When the text held several sentences, the chosen one is
                # re-terminated with a separate ' .' token.
                if len(subsents) > 1:
                    sent = sent.strip() + ' .'
                formatted_sent, ent_inds = convert_text2figer_format(
                    sent, sent_counter, ent_mid)
                sents.append(formatted_sent + '\n')
                all_ent_ins.append(ent_inds)
                raw_sents.append(getrawsent(sent))
                sent_counter += 1
                break  # only the first matching sentence of the line
    return (sents, all_ent_ins, raw_sents)
def load_lines(mylineformat_file):
    """Load a five-column tab-separated file into formatted sentences.

    For each line, the second column (index 1, stripped) is taken as the
    entity id and the fifth column (index 4) as the text. ``getsentences``
    splits the text; only the first sub-sentence for which
    ``has_ent(sent, ent_mid)`` is true is kept and passed through
    ``convert_text2figer_format``. Lines without such a sub-sentence add
    nothing to the output.

    NOTE(review): an earlier definition with the same name exists in this
    file; being later, this one replaces it when the module is loaded.

    Args:
        mylineformat_file: path of the file to read.

    Returns:
        ``(sents, all_ent_ins, raw_sents)``: three lists with one entry per
        kept sentence -- the formatted sentence plus a trailing newline, the
        entity records from ``convert_text2figer_format``, and the result of
        ``getrawsent(sent)``.

    Raises:
        AssertionError: if a line does not split into exactly five columns
            (the line is printed before the assertion fires).
    """
    sents = []
    all_ent_ins = []
    raw_sents = []
    sent_counter = 0
    # Use a context manager so the handle is released on every exit path,
    # including the assertion below; it was previously left open.
    with open(mylineformat_file) as f:
        for line in f:
            parts = line.split('\t')
            if len(parts) != 5:
                # Show the malformed line; the single-argument call form is
                # valid on both Python 2 and Python 3.
                print(line)
            assert len(parts) == 5, (
                'expected 5 tab-separated fields, got %d' % len(parts))
            ent_mid = parts[1].strip()
            text = parts[4]
            subsents = getsentences(text)
            for sent in subsents:
                if has_ent(sent, ent_mid):
                    # A sentence taken out of a multi-sentence text gets an
                    # explicit ' .' terminator token.
                    if len(subsents) > 1:
                        sent = sent.strip() + ' .'
                    (formatted_sent, ent_inds) = convert_text2figer_format(
                        sent, sent_counter, ent_mid)
                    sents.append(formatted_sent + '\n')
                    all_ent_ins.append(ent_inds)
                    raw_sents.append(getrawsent(sent))
                    sent_counter += 1
                    break  # only the first matching sentence of the line
    return (sents, all_ent_ins, raw_sents)
def convert_text2figer_format(sent, sent_counter, ent_mid):
    """Convert a space-separated sentence into one ``token<TAB>label`` line per token.

    The sentence is stripped and split on single spaces. The first token for
    which ``has_ent(token, ent_mid)`` is true is expanded with
    ``getentparts`` into ``(mid, ent_tokens, notabletype)``; its first piece
    is labelled ``B-E`` and the remaining pieces ``I-E``. Any other token
    containing ``/m/`` is also expanded with ``getentparts`` but every piece
    is labelled ``O``. All remaining tokens are emitted unchanged with ``O``.

    NOTE(review): a later definition with the same name in this file rebinds
    it -- confirm which copy is meant to be kept.

    Args:
        sent: the sentence text.
        sent_counter: sentence number written into the entity record.
        ent_mid: entity id passed to ``has_ent``.

    Returns:
        ``(formatted, ent_inds)``: the concatenated label lines, and a list
        holding at most one tab-separated record of sentence number, token
        position, mid, ``str(ent_tokens)`` and notable type.
    """
    out_lines = []
    ent_inds = []
    still_looking = True
    # The position counts input tokens, not emitted lines: a token that
    # expands to several pieces still advances it by one.
    for position, token in enumerate(sent.strip().split(' ')):
        if has_ent(token, ent_mid) and still_looking:
            mid, ent_tokens, notabletype = getentparts(token)
            out_lines.append(ent_tokens[0] + '\tB-E\n')
            for piece in ent_tokens[1:]:
                out_lines.append(piece + '\tI-E\n')
            record_fields = [
                str(sent_counter),
                str(position),
                mid,
                str(ent_tokens),
                notabletype,
            ]
            ent_inds.append('\t'.join(record_fields))
            still_looking = False
        elif '/m/' in token:
            # Other entity mentions (and repeats of the target) are treated
            # as plain text.
            _, ent_tokens, _ = getentparts(token)
            for piece in ent_tokens:
                out_lines.append(piece + '\tO\n')
        else:
            out_lines.append(token + '\tO\n')
    return (''.join(out_lines), ent_inds)
def convert_text2figer_format(sent, sent_counter, ent_mid):
    """Label each token of a sentence and record where the target entity starts.

    ``sent`` is stripped and split on single spaces. Only the first token
    accepted by ``has_ent(token, ent_mid)`` is tagged as the entity: its
    pieces, as returned by ``getentparts``, get ``B-E`` for the first and
    ``I-E`` for the rest. Every other token containing ``/m/`` is expanded
    with ``getentparts`` and its pieces get ``O``; any remaining token is
    written as is with ``O``.

    NOTE(review): an earlier definition with the same name exists in this
    file; being later, this one replaces it when the module is loaded.

    Args:
        sent: the sentence text.
        sent_counter: sentence number stored in the entity record.
        ent_mid: entity id handed to ``has_ent``.

    Returns:
        A tuple of the ``piece<TAB>label`` lines joined into one string, and
        a list with at most one record: sentence number, input-token index,
        mid, ``str(ent_tokens)`` and notable type, joined by tabs.
    """
    labelled = []
    ent_inds = []
    entity_seen = False
    token_index = 0
    for token in sent.strip().split(' '):
        # has_ent is evaluated for every token, as before, even after the
        # entity has been found.
        is_target = has_ent(token, ent_mid) and not entity_seen
        if is_target:
            (mid, ent_tokens, notabletype) = getentparts(token)
            first_piece = ent_tokens[0]
            labelled.append(first_piece + '\tB-E\n')
            labelled.extend(rest + '\tI-E\n' for rest in ent_tokens[1:])
            ent_inds.append('\t'.join((str(sent_counter), str(token_index),
                                       mid, str(ent_tokens), notabletype)))
            entity_seen = True
        elif '/m/' in token:
            pieces = getentparts(token)[1]
            labelled.extend(p + '\tO\n' for p in pieces)
        else:
            labelled.append(token + '\tO\n')
        # Advances once per input token, regardless of how many pieces the
        # token expanded to.
        token_index += 1
    formatted = ''.join(labelled)
    return (formatted, ent_inds)