Example #1
def text_to_word2vec(sentences, word2vec):
    new_text = []
    for sentence in sentences:
        words = text_manipulation.extract_sentence_words(sentence)
        new_text.append([word_model(w, word2vec) for w in words])

    return new_text
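
A minimal usage sketch, assuming word2vec is a gensim KeyedVectors model and that text_manipulation and word_model are importable from the project; the model file and sentences are placeholders:

# Hypothetical usage; the model file and sentences are placeholders.
from gensim.models import KeyedVectors

word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                             binary=True)
sentences = ["The cat sat on the mat.", "It was warm there."]
vectors = text_to_word2vec(sentences, word2vec)
# vectors[i][j] is the embedding of the j-th word of the i-th sentence.
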
Example #2
def read_wiki_file(path,
                   word2vec,
                   remove_preface_segment=True,
                   ignore_list=False,
                   remove_special_tokens=False,
                   return_as_sentences=False,
                   high_granularity=True,
                   only_letters=False):
    data = []
    targets = []
    all_sections = get_sections(path, high_granularity)
    required_sections = all_sections[1:] if remove_preface_segment and len(
        all_sections) > 0 else all_sections
    required_non_empty_sections = [
        section for section in required_sections
        if len(section) > 0 and section != "\n"
    ]

    for section in required_non_empty_sections:
        sentences = section.split('\n')
        if sentences:
            for sentence in sentences:
                is_list_sentence = wiki_utils.get_list_token() + "." == str(
                    sentence)
                if ignore_list and is_list_sentence:
                    continue
                if not return_as_sentences:
                    sentence_words = extract_sentence_words(
                        sentence, remove_special_tokens=remove_special_tokens)
                    if len(sentence_words) >= 1:
                        data.append([
                            word_model(word, word2vec)
                            for word in sentence_words
                        ])
                    else:
                        logger.info('Sentence in wikipedia file is empty')
                else:  # for the annotation: keep the sentence as is.
                    if only_letters:
                        sentence = re.sub('[^a-zA-Z0-9 ]+', '', sentence)
                    data.append(sentence)
            if data:
                targets.append(len(data) - 1)

    return data, targets, path
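
A usage sketch for read_wiki_file, assuming a wiki-727k style document path and an already loaded word2vec model (both placeholders):

# Hypothetical usage; the path is a placeholder.
data, targets, _ = read_wiki_file('wiki_727/test/12345',
                                  word2vec,
                                  remove_preface_segment=True,
                                  ignore_list=True,
                                  high_granularity=False)
# data holds one list of word vectors per sentence; each entry of targets is
# the index of the last sentence of a section, i.e. a segment boundary.
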
Example #3
def visualize_document(path,
                       pred,
                       golden,
                       remove_preface_segment=True,
                       ignore_list=True,
                       remove_special_tokens=True,
                       return_as_sentences=False,
                       high_granularity=False):
    dir_path = "/" + "/".join(path.strip("/").split("/")[:-1])
    all_sections = get_sections(path, high_granularity)
    required_sections = all_sections[1:] if remove_preface_segment and len(
        all_sections) > 0 else all_sections
    required_non_empty_sections = [
        section for section in required_sections
        if len(section) > 0 and section != "\n"
    ]
    final_sentence_lst = []
    for section in required_non_empty_sections:
        sentences = section.split('\n')
        if sentences:
            for sentence in sentences:
                is_list_sentence = wiki_utils.get_list_token() + "." == str(
                    sentence)
                if ignore_list and is_list_sentence:
                    continue
                if not return_as_sentences:
                    sentence_words = extract_sentence_words(
                        sentence, remove_special_tokens=remove_special_tokens)
                    if len(sentence_words) >= 1:
                        final_sentence_lst.append(sentence)

    def write_document(sentences, path, seg, name):
        # seg is a space-separated string of 1-based sentence indices with '|'
        # marking segment boundaries; its first token only triggers the header line.
        with open(os.path.join(path, name), "w") as f:
            seg_list = seg.split(" ")
            wrote_header = False
            for i in seg_list:
                if not wrote_header:
                    f.write("=" * 10 + "start" + "\n")
                    wrote_header = True
                else:
                    if i == "|":  # segment boundary
                        f.write("=" * 10 + "\n")
                    else:
                        f.write(sentences[int(i) - 1] + "\n")

    write_document(final_sentence_lst, dir_path, pred, "pred")
    write_document(final_sentence_lst, dir_path, golden, "golden")
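
A usage sketch, assuming pred and golden follow the format consumed by write_document above (space-separated 1-based sentence indices, '|' between segments, and a first token that only triggers the header line); the path and strings are placeholders:

# Hypothetical usage; the path and segmentation strings are placeholders.
visualize_document('/data/wiki_727/test/12345',
                   pred='0 1 2 | 3 4 5',
                   golden='0 1 2 3 | 4 5')
# Writes "pred" and "golden" files next to the document, separating segments
# with lines of "=" characters.
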
Example #4
def read_choi_file(path,
                   word2vec,
                   sent_bert_vec,
                   train,
                   return_w2v_tensors=True,
                   manifesto=False):
    separator = '========' if manifesto else '=========='
    with Path(path).open('r') as f:
        raw_text = f.read()
    paragraphs = [
        clean_paragraph(p) for p in raw_text.strip().split(separator)
        if len(p) > 5 and p != "\n"
    ]
    if train:
        random.shuffle(paragraphs)

    targets = []
    new_text = []
    text = []
    lastparagraphsentenceidx = 0

    for paragraph in paragraphs:
        if manifesto:
            sentences = split_sentences(paragraph, 0)
        else:
            sentences = [
                s for s in paragraph.split('\n') if len(s.split()) > 0
            ]

        if sentences:
            sentences_count = 0
            # This is the number of sentences in the paragraph and where we need to split.
            for sentence in sentences:
                words, sentence_str = extract_sentence_words(sentence)
                if len(words) == 0:
                    continue
                sentences_count += 1
                text.append(words)
                if return_w2v_tensors:
                    new_text.append([word_model(w, word2vec) for w in words])
                else:
                    new_text.append(words)

            lastparagraphsentenceidx += sentences_count
            targets.append(lastparagraphsentenceidx - 1)

    return new_text, targets, path, sent_bert_vec
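
A usage sketch for read_choi_file; the Choi-corpus path is a placeholder, and since sent_bert_vec is only passed through unchanged, None is enough here:

# Hypothetical usage; the corpus path is a placeholder.
new_text, targets, path, _ = read_choi_file('choi/3-5/1/0.ref',
                                             word2vec,
                                             sent_bert_vec=None,
                                             train=False)
# targets marks the last-sentence index of each paragraph, i.e. the gold
# segmentation of the concatenated document.
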
Example #5
def main(args):
    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)

    algo_delimeter = graphseg_delimeter

    files = get_files(args.folder)
    acc = accuracy.Accuracy()

    for file_path in files:
        with open(str(file_path), "r", encoding="utf-8") as f:
            raw_content = f.read()
        sentences = [
            s for s in raw_content.strip().split("\n")
            if len(s) > 0 and s != "\n"
        ]
        sentences_length = []
        h = []
        t = []
        is_first_sentence = True
        for sentence in sentences:
            if sentence == truth:
                if not is_first_sentence:
                    t[-1] = 1
                continue
            if sentence == algo_delimeter:
                if not is_first_sentence:
                    h[-1] = 1
                continue
            words = extract_sentence_words(sentence)
            sentences_length.append(len(words))
            t.append(0)
            h.append(0)
            is_first_sentence = False
        t[-1] = 1  # end of last segment
        h[-1] = 1  # they already segment it correctly.

        acc.update(h, t)

    calculated_pk, calculated_windiff = acc.calc_accuracy()
    print('Pk: {:.4}.'.format(calculated_pk))
    print('Win_diff: {:.4}.'.format(calculated_windiff))
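
A hypothetical invocation sketch; the flag names mirror the attributes main() reads (args.config, args.folder) but are assumptions about the real command line:

import argparse

# Hypothetical CLI wiring; the flag names are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='config.json')
parser.add_argument('--folder', required=True,
                    help='folder with graphseg output files')
main(parser.parse_args())
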
def process_section(section, id):
    global num_sentneces_for_avg
    global sum_sentneces_for_avg
    sentences = text_manipulation.split_sentences(section, id)
    section_sentences = []
    num_lists = 0
    num_sentences = 0
    num_formulas = 0
    num_codes = 0
    last_sentence_was_list = False
    for sentence in sentences:
        is_list_sentence = wiki_utils.get_list_token() + "." == sentence
        if '\n' in sentence:
            logger.info("DocId: " + str(id) + "   newline in sentence: " +
                        sentence)
        if (wiki_utils.get_list_token() in sentence) and (
                wiki_utils.get_list_token() + ".") != sentence:
            # TODO: delete this branch; it is not supposed to happen any more, but it still does
            num_lists += 1
            last_sentence_was_list = True
            logger.info("DocId: " + str(id) + "     Special case 1: " +
                        sentence)
            continue
        elif is_list_sentence:
            if (last_sentence_was_list):
                continue
            last_sentence_was_list = True
            num_lists += 1
        else:
            last_sentence_was_list = False
            sentence_words = text_manipulation.extract_sentence_words(sentence)
            if len(sentence_words) < wiki_thresholds.min_words_in_sentence:
                # ignore this sentence
                continue
            sum_sentneces_for_avg += len(sentence_words)
            num_sentneces_for_avg += 1

        num_formulas += count_str_occurrences(sentence,
                                              wiki_utils.get_formula_token())
        num_codes += count_str_occurrences(sentence,
                                           wiki_utils.get_codesnipet_token())
        num_sentences += 1
        section_sentences.append(sentence)

    valid_section = True
    error_message = None
    if (num_sentences < wiki_thresholds.min_sentence_in_section):
        valid_section = False
        error_message = "sentences count in section is too low"

    if num_sentences > 0:
        lists_percentage = float(num_lists) / float(num_sentences)
        if lists_percentage >= wiki_thresholds.max_list_in_section_percentage:
            valid_section = False
            error_message = "list percentage in section is too high: " + str(
                lists_percentage)

    section_text = ''.join(section_sentences)
    if len(section_text) < wiki_thresholds.min_section_char_count:
        valid_section = False
        error_message = "char count in section is too low"

    if num_formulas >= wiki_thresholds.max_section_formulas_count:
        valid_section = False
        error_message = "number of formulas in section is too high: " + str(
            num_formulas)

    if num_codes >= wiki_thresholds.max_section_code_snipet_count:
        valid_section = False
        error_message = "number of code snippets in section is too high: " + str(
            num_codes)

    return valid_section, section_sentences, error_message
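
A usage sketch for process_section; the section text and document id are illustrative, and whether the section is valid depends on the thresholds defined in wiki_thresholds:

# Hypothetical usage; the section text and id are placeholders.
section = ("The first sentence of a candidate section. "
           "A second sentence with enough words to pass the threshold.")
valid_section, section_sentences, error_message = process_section(section, 42)
if not valid_section:
    logger.info("Section rejected: " + str(error_message))
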