Example #1
def show_instances(class_name=''):
    name = 'train'
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    f = open(filename, 'r')
    content = f.readlines()
    html_doc = ' '.join(content)
    sentences_string, triples_string = utils.parse(html_doc)
    sentences_words = utils.sentence_tokenize(sentences_string)
    position_triples = utils.find_entity_position(sentences_words,
                                                  triples_string)
    sentences_word_id, sentence_triples_id = utils.turn2id(
        sentences_words, position_triples)
    utils.triples_type(sentence_triples_id)
    if class_name == 'normal':
        func = utils.is_normal_triple
    elif class_name == 'single_entity_overlap':
        func = utils.is_over_lapping
    else:
        func = utils.is_multi_label

    words2id = utils.load_words2id()
    id2words = {v: k for k, v in words2id.items()}
    for sent_words_id, triples_id in zip(sentences_word_id,
                                         sentence_triples_id):
        if func(triples_id, is_relation_first=False):
            print(' '.join([id2words[x] for x in sent_words_id]))
            print(triples_id)
            print('-----------------------------------')
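
Example #1 filters instances with helpers such as utils.is_normal_triple. A minimal sketch of that kind of check, assuming triples_id is a flat list of (entity1_end, entity2_end, relation) ids grouped in threes (a hypothetical reconstruction, not the project's exact implementation):

def is_normal_triple(triples, is_relation_first=False):
    # the relation occupies slot 0 of each group if is_relation_first, else slot 2
    relation_slot = 0 if is_relation_first else 2
    entities = set()
    for idx, value in enumerate(triples):
        if idx % 3 != relation_slot:
            entities.add(value)
    # "normal" means no entity is shared between any two triples
    return len(entities) == 2 * (len(triples) // 3)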
Example #2
def _pre_process(news_file_name, output_file):
    """a very simple pre processing"""
    print("pre processing '%s'..." % news_file_name)
    with open(news_file_name) as f:
        content = f.readlines()

    print("writing into '%s'..." % output_file)
    with open(output_file, 'a') as f:
        # each line as a news article
        for news in content:
            news = news.strip()

            # if the news article is too short, skip it
            if len(news) < 150:
                continue

            # tokenize the news article sentence by sentence
            sentences = sentence_tokenize(news)
            # we need at least 6 sentences, because we are going to remove the first
            # and the last one: they sometimes contain the author, the date and so on
            if len(sentences) >= 6:
                for sentence in sentences[1:-1]:
                    f.write(sentence)
                    f.write('\n')
                # extra newline to separate the news articles
                f.write('\n')
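
In Example #2 (and the wiki variant in Example #5 below), sentence_tokenize acts as a plain sentence splitter over raw text. A minimal stand-in, assuming NLTK and its punkt tokenizer data are installed (an assumption; the projects' own helpers may differ):

import nltk

def sentence_tokenize(text):
    # split raw text into a list of sentence strings
    return nltk.tokenize.sent_tokenize(text)

# e.g. sentence_tokenize("First sentence. Second one.") -> ['First sentence.', 'Second one.']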
Example #3
    def load_data(self, unlabeled_data, ids):
        self.message = {}
        self.ids = []
        self.data_num = 0

        for i in tqdm(ids):
            try:
                sentences = []
                labels = []
                doc = unlabeled_data[i]

                doc_len = []
                sent_len = []

                doc += '.'

                results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", doc)
                results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", dd)
                results = re.compile(
                    r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)',
                    re.S)
                dd = results.sub(" <website> ", dd)

                sents = sentence_tokenize(dd)

                # print(sents)

                for j in range(0, min(len(sents), self.max_seq_num)):

                    tokens = self.tokenizer.tokenize(sents[j])  # tokenize the j-th sentence

                    temp = tokens
                    if len(temp) > 0:
                        temp_ = ['[CLS]']
                        for k in range(0, min(len(temp),
                                              self.max_seq_len - 2)):
                            temp_.append(temp[k])
                        temp_.append('[SEP]')
                        sentences.append(temp_)
                        labels.append(10)
                        sent_len.append(len(temp_) - 1)

                doc_len.append(min(len(sents) - 1, self.max_seq_num - 1))

                self.message[i] = (sentences[:self.max_seq_num],
                                   labels[:self.max_seq_num],
                                   sent_len[:self.max_seq_num], doc_len)
                self.ids.append(i)

            except:
                #print(doc)
                #exit()
                pass
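
Example #3 wraps each tokenized sentence in BERT-style [CLS]/[SEP] markers. A short follow-up sketch, assuming self.tokenizer is a Hugging Face BertTokenizer (an assumption the snippet does not confirm), showing how such token lists become model inputs:

from transformers import BertTokenizer

# hypothetical tokenizer matching the [CLS]/[SEP] convention used above
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = ['[CLS]'] + tokenizer.tokenize('Visit <website> for details.') + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokens)  # integer ids ready for a BERT model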
Example #4
def prepare(name):
    print(name)
    if name == 'train':
        filename = Const.origin_all_train_filename
    if name == 'dev':
        filename = Const.origin_all_dev_filename
    if name == 'example':
        filename = Const.origin_example_filename

    print(Const.triple_len)
    f = open(filename, 'r')
    print(filename)
    content = f.readlines()
    html_doc = ' '.join(content)
    sentences_string, triples_string = utils.parse(html_doc)
    sentences_words = utils.sentence_tokenize(sentences_string)
    position_triples = utils.find_entity_position(sentences_words,
                                                  triples_string)
    sentences_word_id, sentence_triples_id = utils.turn2id(
        sentences_words, position_triples)
    if name == 'train':
        #  split train file into train and valid set
        [valid_sentences_word_id, valid_sentence_triples_id
         ], [train_sentences_word_id,
             train_sentence_triples_id] = utils.split(sentences_word_id,
                                                      sentence_triples_id)
        utils.static_triples_info(train_sentence_triples_id)
        utils.triples_type(train_sentence_triples_id)
        utils.static_triples_info(valid_sentence_triples_id)
        utils.triples_type(valid_sentence_triples_id)
        json.dump([train_sentences_word_id, train_sentence_triples_id],
                  open(Const.train_filename, 'w'))
        json.dump([valid_sentences_word_id, valid_sentence_triples_id],
                  open(Const.valid_filename, 'w'))
        utils.instances2nyt_style(
            [train_sentences_word_id, train_sentence_triples_id],
            Const.nyt_style_raw_train_filename)
        utils.instances2nyt_style(
            [valid_sentences_word_id, valid_sentence_triples_id],
            Const.nyt_style_raw_valid_filename)
    elif name == 'dev':
        utils.triples_type(sentence_triples_id)
        json.dump([sentences_word_id, sentence_triples_id],
                  open(Const.dev_filename, 'w'))
        utils.instances2nyt_style([sentences_word_id, sentence_triples_id],
                                  Const.nyt_style_raw_test_filename)
    else:
        utils.triples_type(sentence_triples_id)
        json.dump([sentences_word_id, sentence_triples_id],
                  open(Const.example_filename, 'w'))
Example #5
def _pre_process(wiki_file_name):
    """a very simple pre processing"""
    print("pre processing '%s'..." % wiki_file_name)
    with open(wiki_file_name) as f:
        content = f.readlines()

    # some articles have only one line, so collect them into a list first and filter them out later
    articles = [[]]

    article_start = False
    # process line by line
    for line in content:
        line = line.strip()

        # ignore empty line
        if not line:
            continue
        # ignore article start
        if line.startswith('<doc'):
            article_start = True
            continue
        # after article start, there is the article title, ignore it
        if article_start:
            article_start = False
            continue
        # ignore categories
        if line.startswith('[['):
            continue
        if line.startswith('</doc>'):
            articles.append([])
            continue
        # tokenize the line sentence by sentence
        for sentence in sentence_tokenize(line):
            articles[-1].append(sentence)

    # we need articles with at least 2 lines
    articles = [d for d in articles if len(d) >= 2]

    # write into a single file
    with open(join(MN_CORPUS_FOLDER, MN_WIKI_FILE), 'a') as f:
        for article in articles:
            for sentence in article:
                f.write(sentence)
                f.write('\n')
            # extra newline to separate the articles
            f.write('\n')
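
A hypothetical driver for Example #5, assuming the extracted wiki files sit under an illustrative WIKI_EXTRACT_FOLDER directory (the name and layout are not taken from the snippet):

import glob
from os.path import join

WIKI_EXTRACT_FOLDER = 'extracted'  # hypothetical location of WikiExtractor output

# _pre_process appends to a single corpus file, so each extracted chunk is
# simply processed in turn
for wiki_file in sorted(glob.glob(join(WIKI_EXTRACT_FOLDER, '**', 'wiki_*'), recursive=True)):
    _pre_process(wiki_file)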
Example #6
def statics(name):
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    f = open(filename, 'r', encoding='utf-8')
    content = f.readlines()
    html_doc = ' '.join(content)
    sentences_string, triples_string = utils.parse(html_doc)  # parse the raw content
    sentences_words = utils.sentence_tokenize(sentences_string)  # tokenize the sentences
    relations_words = utils.static_relations(triples_string)  # tokenize the relations and build a relation-to-id mapping
    sentences_words.extend(relations_words)  # count not only the words in the sentences but also the words in the relations
    words2id = utils.static_words(sentences_words)  # build the word-to-id mapping
    relations_words_id = [None]
    for r_words in relations_words:
        r_words_id = [utils.turn_word2id(w, words2id) for w in r_words]
        relations_words_id.append(r_words_id)
    json.dump(relations_words_id, open(Const.relations_words_id_filename, 'w', encoding='utf-8'), indent=False)
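
Example #6 maps relation words to ids through utils.turn_word2id. A minimal sketch of what such a lookup typically does, with an <unk> fallback (hypothetical; the project's utils may behave differently):

def turn_word2id(word, words2id, unk_id=1):
    # map a word to its id, falling back to a reserved <unk> id
    return words2id.get(word, unk_id)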
Example #7
def prepare(name):
    print(name)
    if name == 'train':
        filename = Const.origin_all_train_filename
    if name == 'dev':
        filename = Const.origin_all_dev_filename
    if name == 'example':
        filename = Const.origin_example_filename

    print(Const.triple_len)
    f = open(filename, 'r', encoding='utf-8')
    print(filename)
    content = f.readlines()
    html_doc = ' '.join(content)

    if not os.path.isfile(name + "_parse.json"):
        sentences_string, triples_string = utils.parse(html_doc)
        json.dump([sentences_string, triples_string], open(name + '_parse.json', 'w', encoding='utf-8'))
    else:
        sentences_string, triples_string = json.load(open(name + '_parse.json', 'r', encoding='utf-8'))

    sentences_words = utils.sentence_tokenize(sentences_string)
    position_triples = utils.find_entity_position(sentences_words, triples_string)  # record the index of each entity's last word: (e1_end, e2_end, relation)
    sentences_word_id, sentence_triples_id = utils.turn2id(sentences_words, position_triples)  # convert the sentences and relation types to ids
    if name == 'train':
        #  split train file into train and valid set
        [valid_sentences_word_id, valid_sentence_triples_id], [train_sentences_word_id, train_sentence_triples_id] = utils.split(sentences_word_id, sentence_triples_id)
        utils.static_triples_info(train_sentence_triples_id)
        utils.triples_type(train_sentence_triples_id)
        utils.static_triples_info(valid_sentence_triples_id)
        utils.triples_type(valid_sentence_triples_id)
        json.dump([train_sentences_word_id, train_sentence_triples_id], open(Const.train_filename, 'w', encoding='utf-8'))
        json.dump([valid_sentences_word_id, valid_sentence_triples_id], open(Const.valid_filename, 'w', encoding='utf-8'))
        utils.instances2nyt_style([train_sentences_word_id, train_sentence_triples_id], Const.nyt_style_raw_train_filename)
        utils.instances2nyt_style([valid_sentences_word_id, valid_sentence_triples_id], Const.nyt_style_raw_valid_filename)
    elif name == 'dev':
        utils.triples_type(sentence_triples_id)
        json.dump([sentences_word_id, sentence_triples_id], open(Const.dev_filename, 'w', encoding='utf-8'))
        utils.instances2nyt_style([sentences_word_id, sentence_triples_id], Const.nyt_style_raw_test_filename)
    else:
        utils.triples_type(sentence_triples_id)
        json.dump([sentences_word_id, sentence_triples_id], open(Const.example_filename, 'w', encoding='utf-8'))
Example #8
def _process_section(section, main_class):
    content = section.get_content().decode("utf-8")
    soup = BeautifulSoup(content, 'html.parser')

    lines = []
    children = soup.find('body').findChildren()
    for child in children:
        if child.get('class') is not None and main_class in child.get('class'):
            line = child.text.strip()
            if len(line) > 0:
                lines.append(line)
        else:
            lines.append(EMPTY_LINE)
    if len(lines) >= 2:
        lines = _unwrap_lines(lines)
    lines = [_process_line(line) for line in lines]
    lines.append(EMPTY_LINE)

    sentences = []
    for line in lines:
        sentences += sentence_tokenize(line)

    return sentences
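
Example #8 operates on a single EPUB section. A hypothetical driver, assuming the ebooklib package and an illustrative CSS class name ('calibre1'); EMPTY_LINE, _unwrap_lines and _process_line come from the surrounding project:

import ebooklib
from ebooklib import epub

book = epub.read_epub('book.epub')  # illustrative path
all_sentences = []
for section in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
    # 'calibre1' is a placeholder for the class that marks body paragraphs
    all_sentences.extend(_process_section(section, 'calibre1'))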
Example #9
    def load_data(self, unlabeled_data, ids):
        self.message = {}
        self.ids = []
        self.data_num = 0

        for i in ids:
            try:
                sentences = []
                labels = []
                doc = unlabeled_data[i]

                doc_len = []
                sent_len = []

                doc += '.'

                results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", doc)
                results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", dd)
                results = re.compile(
                    r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)',
                    re.S)
                dd = results.sub(" <website> ", dd)

                sents = sentence_tokenize(dd)

                # print(sents)

                for j in range(0, len(sents)):
                    a = regexp_tokenize(transform_format(sents[j]),
                                        self.pattern)
                    temp = []
                    for k in range(0, len(a)):
                        if a[k] not in self.english_punctuations and check_ack_word(
                                a[k]) == 1:
                            if a[k].isdigit():
                                a[k] = '<number>'
                            elif a[k][0] == '$':
                                a[k] = '<money>'
                            elif a[k][-1] == '%':
                                a[k] = '<percentage>'
                            temp.append(a[k].lower())

                    if len(temp) > 0:
                        temp_ = ['<sos>']
                        for k in range(0, min(len(temp),
                                              self.max_seq_len - 2)):
                            temp_.append(temp[k])
                        temp_.append('<eos>')
                        sentences.append(temp_)
                        labels.append(10)
                        sent_len.append(len(temp_) - 1)

                doc_len.append(min(len(sents) - 1, self.max_seq_num - 1))

                self.message[i] = (sentences[:self.max_seq_num],
                                   labels[:self.max_seq_num],
                                   sent_len[:self.max_seq_num], doc_len)
                self.ids.append(i)

            except:
                #print(doc)
                #exit()
                pass
Example #10
    def build_vocab(self, unlabeled_data, labeled_data, embedding_size,
                    max_seq_num, max_seq_len):
        sentences = []
        words = []
        if unlabeled_data is not None:
            for (u, v) in unlabeled_data.items():
                try:
                    results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                    dd = results.sub(" <website> ", v)
                    results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                    dd = results.sub(" <website> ", dd)
                    results = re.compile(
                        r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)',
                        re.S)
                    dd = results.sub(" <website> ", dd)
                    sents = sentence_tokenize(dd)
                    for j in range(0, len(sents)):
                        a = regexp_tokenize(transform_format(sents[j]),
                                            self.pattern)
                        temp = []
                        for k in range(0, len(a)):
                            if a[k] not in self.english_punctuations and check_ack_word(
                                    a[k]) == 1:
                                if a[k].isdigit():
                                    a[k] = '<number>'
                                elif a[k][0] == '$':
                                    a[k] = '<money>'
                                elif a[k][-1] == '%':
                                    a[k] = '<percentage>'
                                temp.append(a[k].lower())
                                words.append(a[k].lower())
                        if len(temp) > 0:
                            sentences.append(temp)
                except:
                    #print(u,v)
                    #exit()
                    pass

        if labeled_data is not None:
            for (u, v) in labeled_data.items():
                for i in range(0, len(v[0])):
                    v[0][i] = str(v[0][i])
                    try:
                        results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*',
                                             re.S)
                        dd = results.sub(" <website> ", v[0][i])
                        results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*',
                                             re.S)
                        dd = results.sub(" <website> ", dd)
                        results = re.compile(
                            r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)',
                            re.S)
                        dd = results.sub(" <website> ", dd)
                    except:
                        print(u, v)
                        print(v[0][i])
                        exit()
                    a = regexp_tokenize(transform_format(dd), self.pattern)
                    temp = []
                    for k in range(0, len(a)):
                        if a[k] not in self.english_punctuations and check_ack_word(
                                a[k]) == 1:
                            if a[k].isdigit():
                                a[k] = '<number>'
                            elif a[k][0] == '$':
                                a[k] = '<money>'
                            elif a[k][-1] == '%':
                                a[k] = '<percentage>'
                            temp.append(a[k].lower())
                            words.append(a[k].lower())
                    if len(temp) > 0:
                        sentences.append(temp)

        word_frequency = {}
        for i in range(0, len(words)):
            if words[i] in word_frequency:
                word_frequency[words[i]] += 1
            else:
                word_frequency[words[i]] = 1

        # size/iter are the pre-4.0 gensim argument names (vector_size/epochs in gensim >= 4.0)
        self.model = gensim.models.Word2Vec(sentences,
                                            size=embedding_size,
                                            window=5,
                                            min_count=1,
                                            iter=20,
                                            negative=50)

        x = 4  # next free word id; ids 0-3 are reserved for <pad>, <unk>, <sos>, <eos>
        self.word2id['<pad>'] = 0
        self.id2word[0] = '<pad>'
        self.word2id['<sos>'] = 2
        self.id2word[2] = '<sos>'
        self.word2id['<eos>'] = 3
        self.id2word[3] = '<eos>'

        self.unk_count = 0

        for i in range(0, len(sentences)):
            for j in range(0, len(sentences[i])):
                if word_frequency[sentences[i][j].lower()] >= 2:
                    if sentences[i][j].lower() in self.model:
                        if sentences[i][j].lower() in self.word2id:
                            pass
                        else:
                            self.word2id[sentences[i][j].lower()] = x
                            self.id2word[x] = sentences[i][j].lower()
                            x = x + 1
                else:
                    self.word2id['<unk>'] = 1
                    self.id2word[1] = '<unk>'
                    self.unk_count += 1
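
A sketch of how the vocabulary built in Example #10 would typically be used afterwards: encoding a tokenized sentence into ids with the reserved <pad>/<unk>/<sos>/<eos> slots (hypothetical; the class's real encoding method is not shown):

def encode(tokens, word2id, max_seq_len):
    # wrap the sentence in <sos>/<eos> and map unknown words to <unk> (id 1)
    ids = [word2id['<sos>']]
    for token in tokens[:max_seq_len - 2]:
        ids.append(word2id.get(token.lower(), 1))
    ids.append(word2id['<eos>'])
    # pad on the right with <pad> (id 0) up to max_seq_len
    ids += [word2id['<pad>']] * (max_seq_len - len(ids))
    return ids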