    def __init__(self):
        self.data = []
        self.dictionary = Dictionary()
        self.max_sent_len = 0

        # Read the positive reviews
        with open(POSITIVE_REVIEWS_FILE, encoding='utf-8') as f:
            positive_reviews = f.readlines()
        for review in positive_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 1))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Read the negative reviews
        with open(NEGATIVE_REVIEWS_FILE, encoding='utf-8') as f:
            negative_reviews = f.readlines()
        for review in negative_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 0))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Split the original dataset into train/test
        random.shuffle(self.data)
        split_index = int(0.9 * len(self.data))
        self.train = AugmentedList(self.data[:split_index])
        self.test = AugmentedList(self.data[split_index:])
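A minimal usage sketch for the constructor above, assuming it belongs to a review dataset class (called SentimentDataset here purely for illustration) and that POSITIVE_REVIEWS_FILE / NEGATIVE_REVIEWS_FILE, Dictionary, normalize_string, word_tokenize and AugmentedList come from the surrounding module:

# Hypothetical usage; SentimentDataset is an assumed name for the class above.
dataset = SentimentDataset()
print(len(dataset.data))        # total number of (review, label) pairs
print(dataset.max_sent_len)     # longest tokenized review plus 2 (presumably room for start/end tokens)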
Example #2
def score_it_s(s, d):
    sents_in_doc = docsentdic[d]
    doc_words_lower_lemmatized = []
    for sent in sents_in_doc:
        doc_words_lower_lemmatized += [
            lemmatizer.lemmatize(t)
            for t in word_tokenize(sentwordrawdic[sent], lower=True)
        ]
    doc_words_lower_lemmatized = [
        t for t in doc_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    sent_words_lower_lemmatized = [
        lemmatizer.lemmatize(t) for t in word_tokenize(s, lower=True)
    ]
    sent_words_lower_lemmatized = [
        t for t in sent_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]

    return len([
        t for t in sent_words_lower_lemmatized
        if t in index_term_lower_lemmatized
    ]) / len([
        t
        for t in doc_words_lower_lemmatized if t in index_term_lower_lemmatized
    ])
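One design note on score_it_s: stopwords.words('english') is re-read for every token inside the comprehensions. A small sketch of the usual fix, assuming NLTK and its stopword corpus are installed, is to cache the list once as a set:

from nltk.corpus import stopwords

# Build the stopword set once; membership tests are then O(1) per token
# instead of re-reading the NLTK word list inside each list comprehension.
EN_STOPWORDS = set(stopwords.words('english'))

def drop_stopwords(tokens):
    return [t for t in tokens if t not in EN_STOPWORDS]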
Example #3
def convert_to_features(config, data, word2idx_dict, char2idx_dict):
    example = {}
    context, question = data
    context = context.replace("''", '" ').replace("``", '" ')
    question = question.replace("''", '" ').replace("``", '" ')
    example['context_tokens'] = word_tokenize(context)
    example['ques_tokens'] = word_tokenize(question)
    example['context_chars'] = [
        list(token) for token in example['context_tokens']
    ]
    example['ques_chars'] = [list(token) for token in example['ques_tokens']]

    para_limit = config.para_limit
    ques_limit = config.ques_limit
    # ans_limit = 100
    char_limit = config.char_limit

    def filter_func(_example):
        return len(_example["context_tokens"]) > para_limit or \
               len(_example["ques_tokens"]) > ques_limit

    if filter_func(example):
        raise ValueError("Context/question lengths are over the limit")

    context_idxs = np.zeros([para_limit], dtype=np.int32)
    context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
    ques_idxs = np.zeros([ques_limit], dtype=np.int32)
    ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)

    def _get_word(word):
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in word2idx_dict:
                return word2idx_dict[each]
        return 1

    def _get_char(_char):
        if _char in char2idx_dict:
            return char2idx_dict[_char]
        return 1

    for i, token in enumerate(example["context_tokens"]):
        context_idxs[i] = _get_word(token)

    for i, token in enumerate(example["ques_tokens"]):
        ques_idxs[i] = _get_word(token)

    for i, token in enumerate(example["context_chars"]):
        for j, char in enumerate(token):
            if j == char_limit:
                break
            context_char_idxs[i, j] = _get_char(char)

    for i, token in enumerate(example["ques_chars"]):
        for j, char in enumerate(token):
            if j == char_limit:
                break
            ques_char_idxs[i, j] = _get_char(char)

    return context_idxs, context_char_idxs, ques_idxs, ques_char_idxs
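A hedged usage sketch of convert_to_features: the SimpleNamespace config, the toy vocabularies and the id 1 used for out-of-vocabulary entries are illustrative assumptions, and word_tokenize is whatever tokenizer the module above imports:

from types import SimpleNamespace

# Toy config and vocabularies, for illustration only.
toy_config = SimpleNamespace(para_limit=400, ques_limit=50, char_limit=16)
toy_word2idx = {"--NULL--": 0, "--OOV--": 1, "the": 2, "cat": 3}
toy_char2idx = {"--NULL--": 0, "--OOV--": 1, "t": 2, "h": 3, "e": 4, "c": 5}

c, ch, q, qh = convert_to_features(
    toy_config,
    ("The cat sat on the mat.", "Where did the cat sit?"),
    toy_word2idx, toy_char2idx)
print(c.shape, ch.shape, q.shape, qh.shape)  # (400,) (400, 16) (50,) (50, 16)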
Example #4
def get_answer(content, question, session, model, word_dictionary, char_dictionary, config):
    try:
        content_tokenized = word_tokenize(content.replace("''", '" ').replace("``", '" '))
        content = ''.join(content_tokenized[:config.para_limit])

        candidate_keys = ['什么', '谁', '哪', '几', '何', '多', '是否', '怎么', '嘛', '怎样']
        question_tokenized = word_tokenize(question.replace("''", '" ').replace("``", '" '))
        if len(question_tokenized) > config.ques_limit:
            find, pos = False, 0
            for idx, token in enumerate(question_tokenized):
                for key in candidate_keys:
                    if key in token:
                        find, pos = True, idx
                        break
                if find:
                    break
            if find:
                question = ''.join(
                    question_tokenized[max(0, pos - int(config.ques_limit/2 + 1)):
                                       min(pos + int(config.ques_limit/2 - 1), len(question_tokenized)-1)])
            else:
                question = ''.join(question_tokenized[len(question_tokenized) - config.ques_limit + 1:])

        c, ch, q, qh = convert_to_features(config, (content, question), word_dictionary, char_dictionary)
        fd = {'context:0': [c],
              'question:0': [q],
              'context_char:0': [ch],
              'question_char:0': [qh]}

        yp1, yp2 = session.run([model.yp1, model.yp2], feed_dict=fd)
        yp2[0] += 1
        return "".join(content_tokenized[yp1[0]:yp2[0]])
    except ValueError:
        print("ValueError triggered!")
        return None
Example #5
    def split_data(self, filename):
        self.load_data(filename)
        sub_dir = filename.split('-')[0]

        # create a subdirectory for Train and Dev data
        if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
            os.makedirs(os.path.join(self.data_dir, sub_dir))

        with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.labels'), 'w', encoding="utf-8") as labels_file:

            # loop over the data
            for article_id in tqdm.tqdm(range(len(self.data['data']))):
                list_paragraphs = self.data['data'][article_id]['paragraphs']
                # loop over the paragraphs
                for paragraph in list_paragraphs:
                    context = paragraph['context']
                    context = clean_text(context)
                    context_tokens = [w for w in word_tokenize(context) if w]
                    spans = convert_idx(context, context_tokens)
                    qas = paragraph['qas']
                    # loop over Q/A
                    for qa in qas:
                        question = qa['question']
                        question = clean_text(question)
                        question_tokens = [w for w in word_tokenize(question) if w]
                        if sub_dir == "train":
                            # select only one ground truth (the top answer), if any answers exist
                            answer_ids = 1 if qa['answers'] else 0
                        else:
                            answer_ids = len(qa['answers'])
                        labels = []
                        if answer_ids:
                            for answer_id in range(answer_ids):
                                answer = qa['answers'][answer_id]['text']
                                answer = clean_text(answer)
                                answer_tokens = [w for w in word_tokenize(answer) if w]
                                answer_start = qa['answers'][answer_id]['answer_start']
                                answer_stop = answer_start + len(answer)
                                answer_span = []
                                for idx, span in enumerate(spans):
                                    if not (answer_stop <= span[0] or answer_start >= span[1]):
                                        answer_span.append(idx)
                                if not answer_span:
                                    continue
                                labels.append(str(answer_span[0]) + ' ' + str(answer_span[-1]))

                            # write to file
                            context_file.write(' '.join([token for token in context_tokens]) + '\n')
                            question_file.write(' '.join([token for token in question_tokens]) + '\n')
                            answer_file.write(' '.join([token for token in answer_tokens]) + '\n')
                            labels_file.write("|".join(labels) + "\n")
Example #6
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    categories = get_categories(text)
    if not list(set(categories).intersection(input_categories)):
        return None, None, None, None
    text = filter_wiki(text)
    sentences = sentence_tokenize(text)
    title = title.replace(' ', '_')
    paragraphs = {}

    # Split document into paragraphs
    # sentences = [s0, s1, t0, s2, t1, ...]
    paragraph_title = [title]
    level = 1
    this_sentences = []

    for sent in sentences:
        # Sent is a paragraph title
        if sent[:1] == '=':
            pt = '/'.join(paragraph_title)
            pt = pt.replace(',', '')
            paragraphs[pt] = this_sentences
            this_sentences = []
            # Level of paragraph
            level = max(len(s) for s in re.findall(r'=+', sent))
            this_title = sent[level:len(sent)-level].strip().replace(' ', '_')
            if level > len(paragraph_title):
                paragraph_title.append(this_title)
            elif level < len(paragraph_title):
                for i in range(len(paragraph_title)-level):
                    paragraph_title.pop()
                paragraph_title[level-1] = this_title
            else:
                paragraph_title[level-1] = this_title
        else:
            this_sentences.append(sent)
    pt = '/'.join(paragraph_title)
    pt = pt.replace(',', '')
    paragraphs[pt] = this_sentences

    if lemmatize:
        result = {k: [utils.lemmatize(s) for s in v if len(utils.lemmatize(s)) >= 2] for k, v in paragraphs.items() if len(v) >= 0}
    else:
        result = {k: [word_tokenize(s) for s in v if len(word_tokenize(s)) >= 2] for k, v in paragraphs.items() if len(v) >= 0}
    return categories, result, title, pageid
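A small worked example of the heading logic in process_article, assuming MediaWiki-style section markers such as "== History ==":

import re

# "== History ==" has level 2 (two '=' signs) and title "History";
# this mirrors the level/title extraction inside the loop above.
sent = "== History =="
level = max(len(s) for s in re.findall(r'=+', sent))                   # 2
this_title = sent[level:len(sent) - level].strip().replace(' ', '_')   # 'History'
print(level, this_title)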
Example #7
    def get_terms_and_words(self, field):
        words = [
            self.analyzer.parse(word)[0].normal_form for word in word_tokenize(field)
            if word not in stop_words]

        terms = set(words)
        return terms, words
Example #8
 def _tokenize(self, text):
     if not self._pristine_input:
         text = text.lower()
     if self.word_tokens:
         if self._pristine_input:
             return text.split()
         return word_tokenize(text)
     return text
Example #9
def get_clean_text(text_list):
    """
    :param text_list: a list of strings
    :return: string - tokenized and with sent tags
    """
    text_list = [txt for txt in text_list if len(txt.strip()) > 0]
    text_list = [' '.join(utils.word_tokenize(txt, tokenizer)) for txt in text_list]
    text = utils.sent_list_to_tagged_str(text_list)
    return text
Example #10
def build_word_vectors(infile_name, outfile_name):
    print('building word vectors...')
    pynlpir.open()
    jieba.initialize()

    df = pd.read_json(infile_name)
    with open(outfile_name, 'w') as f:
        for content in tqdm(df.article_content):
            f.write(' '.join(word_tokenize(content)))
Example #11
def score_tfidf_s(s, d):
    sent_words_lower_lemmatized = [
        lemmatizer.lemmatize(t) for t in word_tokenize(s, lower=True)
    ]
    sent_words_lower_lemmatized = [
        t for t in sent_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]

    return mean([score_tfidf_w(w, d) for w in sent_words_lower_lemmatized])
Example #12
 def next_batch(self, batch_size, mode=TRAIN_MODE):
     review_lengths, reviews, targets = [], [], []
     data = self.train if mode == TRAIN_MODE else self.test
     batch = data.next_items(batch_size)
     for (review, target) in batch:
         review_length = len(word_tokenize(normalize_string(review)))
         review = indexes_from_sentence(review, self.dictionary,
                                        self.max_sent_len)
         target = one_hot_encoding(2, target)
         reviews.append(review)
         targets.append(target)
         review_lengths.append(review_length)
     return review_lengths, reviews, targets
Example #13
def tf(w, d):
    sents_in_doc = docsentdic[d]
    doc_words_lower_lemmatized = []
    for sent in sents_in_doc:
        doc_words_lower_lemmatized += [
            lemmatizer.lemmatize(t)
            for t in word_tokenize(sentwordrawdic[sent], lower=True)
        ]
    doc_words_lower_lemmatized = [
        t for t in doc_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]

    return math.log(1 + (doc_words_lower_lemmatized.count(w) /
                         len(doc_words_lower_lemmatized)))
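For concreteness, the log-scaled term frequency returned above, for a word that occurs 3 times in a 300-token document, is log(1 + 3/300) ≈ 0.00995:

import math

# tf(w, d) = log(1 + count(w, d) / |d|), e.g. 3 occurrences in 300 tokens:
print(math.log(1 + 3 / 300))   # 0.009950...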
Example #14
def score_dt_s(s, d):
    doc_title_words = d.replace('_', ' ').replace('/', ' ').split()
    doc_title_words_lower = [t.lower() for t in doc_title_words]
    doc_title_words_lower_lemmatized = [
        lemmatizer.lemmatize(t) for t in doc_title_words_lower
    ]
    doc_title_words_lower_lemmatized = [
        t for t in doc_title_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    sent_words = word_tokenize(s, lower=True)
    sent_words_lemmatized = [lemmatizer.lemmatize(t) for t in sent_words]

    return len([
        x
        for x in doc_title_words_lower_lemmatized if x in sent_words_lemmatized
    ]) / len(doc_title_words_lower_lemmatized)
Example #15
def normalize_corpus(corpus, lemmatize=False):
    # Input: a list of documents; returns a 2-D list in which each element is the word list of one document
    normalize_corpus = []
    text_list = [remove_special_characters(text)
                 for text in corpus]  # raw English texts with special characters removed; a cleaner input
    for text in text_list:
        text = expand_contrations(text)
        if (lemmatize):
            pass
        else:
            text = text.lower()
        text = word_tokenize(text)
        normalize_corpus.append(text)
        #print(text)
    normalize_corpus = [
        remove_stopwords(text, 'stopwords.txt') for text in normalize_corpus
    ]
    return normalize_corpus
Example #16
    def prep_input(self, input):
        # clean input
        article_list = [clean_str(x.strip()) for x in input]

        # tokenize
        x = [word_tokenize(d) for d in article_list]

        # replace with dictionary or unk
        x = [[self.word_dict.get(w, self.word_dict["<unk>"]) for w in d]
             for d in x]

        # trim as necessary
        x = [d[:self.article_max_len] for d in x]
        x = [
            d +
            (self.article_max_len - len(d)) * [self.word_dict["<padding>"]]
            for d in x
        ]

        return x
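A toy run of the trim-and-pad step in prep_input, with an illustrative word_dict and length limit (not the model's real vocabulary):

# Illustrative vocabulary and article length limit.
word_dict = {"<unk>": 0, "<padding>": 1, "hello": 2, "world": 3}
article_max_len = 5

d = [2, 3, 0]                                    # "hello world <unk>"
d = d[:article_max_len]                          # trim
d = d + (article_max_len - len(d)) * [word_dict["<padding>"]]   # pad
print(d)                                         # [2, 3, 0, 1, 1]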
Example #17
def concatenate_data(squad_data_dir,
                     newsqa_data_dir,
                     out_dir,
                     env="train",
                     full_context=False):
    ext = ".context" if full_context else ".sentence"
    sentence_files = [
        os.path.join(squad_data_dir, env, env + ext),
        os.path.join(newsqa_data_dir, env, env + ext)
    ]
    question_files = [
        os.path.join(squad_data_dir, env, env + ".question"),
        os.path.join(newsqa_data_dir, env, env + ".question")
    ]
    out_sentence_filename = os.path.join(out_dir, env + ext)
    out_question_filename = os.path.join(out_dir, env + ".question")

    for infiles, outfile in zip(
        [sentence_files, question_files],
        [out_sentence_filename, out_question_filename]):
        with open(outfile, "w") as o:
            for f in infiles:
                with open(f) as infile:
                    for line in infile:
                        o.write(line)

    with open(out_sentence_filename, "r") as f,\
         open(out_question_filename, "r") as g:
        sentence_lines = f.readlines()
        question_lines = g.readlines()

    sentence_lines, question_lines = zip(
        *[(s, q) for s, q in sorted(zip(sentence_lines, question_lines),
                                    key=lambda x: len(word_tokenize(x[0])))])

    with open(out_sentence_filename, "w") as f,\
         open(out_question_filename, "w") as g:
        for line in sentence_lines:
            f.write(line)
        for line in question_lines:
            g.write(line)
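The zip/sort/unzip idiom at the end of concatenate_data keeps each sentence aligned with its question while ordering the pairs by tokenized sentence length, so that later batching pads less. A toy illustration, using a plain split() in place of word_tokenize:

# Pairs stay aligned after sorting by the token length of the first element.
pairs = [("a much longer sentence goes here", "q1"), ("short one", "q2")]
sents, ques = zip(*sorted(pairs, key=lambda x: len(x[0].split())))
print(sents)   # ('short one', 'a much longer sentence goes here')
print(ques)    # ('q2', 'q1')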
Example #18
 def parse_sent(self, sent, fix_length):
     sent = [self.word_dict[w] if w in self.word_dict else 0 for w in utils.word_tokenize(sent)]
     sent, _ = self.pad_to_fix_len(sent, fix_length, padding_front=False)
     return sent
Example #19
def process_file(filename, data_type, word_counter, char_counter, ques_limit):
    """
    Read the content from the text file and perform initial preprocessing. If the dataset is the train split, the content also needs to be filtered.
    :param filename:
    :param data_type:
    :param word_counter:
    :param char_counter:
    :param ques_limit:
    :return:
    """

    print("Processing {} examples...".format(data_type))
    examples = []
    eval_examples = {}
    total = 0

    with open(filename, "r") as fh:
        source = json.load(fh)
        # TODO filtering is done during preprocessing, but spans can no longer be computed afterwards
        for article in tqdm(source):
            content = article['article_title'] + '。' + article[
                'article_content']
            content_tokens = word_tokenize(content)
            content_chars = [list(token) for token in content_tokens]
            spans = convert_idx(content, content_tokens)

            for token in content_tokens:
                word_counter[token] += len(article['questions'])
                for char in token:
                    char_counter[char] += len(article["questions"])

            for q in article['questions']:
                question_text = q["question"]
                answer_text = q['answer']
                question_tokens = word_tokenize(question_text)
                question_tokens = shrink_question_tokens(
                    question_tokens, ques_limit)

                question_chars = [list(token) for token in question_tokens]
                result = list(substring_indexes(answer_text, content))

                for token in question_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1

                if len(result) == 1:
                    # Convert the character offset in result to token positions; y1 and y2 are the start and end token indices
                    current_pos, start_token, end_token = 0, -1, -1
                    for token_cnt, token in enumerate(content_tokens):
                        if current_pos > result[0] and start_token == -1:
                            start_token = token_cnt - 1
                        if current_pos > result[0] + len(q["answer"]):
                            end_token = token_cnt - 2
                            break
                        current_pos += len(token)
                    total += 1
                    example = {
                        "context_tokens": content_tokens,
                        "context_chars": content_chars,
                        "ques_tokens": question_tokens,
                        "ques_chars": question_chars,
                        "y1s": [start_token],
                        "y2s": [end_token],
                        "id": total
                    }
                    eval_examples[str(total)] = {
                        "context": content,
                        "spans": spans,  # 全文的每个token与位置的对应关系
                        "answers": [answer_text],  # TODO 改成不分para的
                        "uuid": q["questions_id"]
                    }  # example中没有存储原始的问题文本信息,在这里保存了,在后续的结果展示中可以用到。
                    examples.append(example)  # 不考虑任何跨段的问题

        random.shuffle(examples)
        print("{} questions in total".format(len(examples)))
    return examples, eval_examples
Example #20
    def split_data(self, filename):
        self.load_data(filename)

        envs = ["train", "dev"]
        for sub_dir in envs:
            # create a subdirectory for Train and Dev data
            if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
                os.makedirs(os.path.join(self.data_dir, sub_dir))

            with open(os.path.join(self.data_dir, sub_dir, sub_dir + ".context"), "w", encoding="utf-8") as context_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".sentence"), "w", encoding="utf-8") as sentence_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".question"), "w", encoding="utf-8") as question_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".answer"), "w", encoding="utf-8") as answer_file:

                # loop over the data
                for article in tqdm.tqdm(self.data["data"]):
                    context = article["text"]
                    context_tokens = word_tokenize(context)
                    context_sentences = sent_tokenize(context)
                    if config.paragraph and (
                            len(context_tokens) < config.min_len_context
                            or len(context_tokens) > config.max_len_context):
                        continue
                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)
                    if not article["type"] == sub_dir:
                        continue
                    for question in article["questions"]:
                        if question.get("isQuestionBad") == 0 and question[
                                "consensus"].get("s"):
                            q = question["q"].strip()
                            if q[-1] != "?" or len(q.split(
                            )) < config.min_len_question or len(
                                    q.split()) > config.max_len_question:
                                continue
                            answer_start = question["consensus"]["s"]
                            answer = context[question["consensus"]["s"]:
                                             question["consensus"]["e"]].strip(
                                                 ".| ").strip("\n")
                            answer_stop = answer_start + len(answer)

                            # Getting spans of the answer in the context
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_stop <= span[0]
                                        or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if not answer_span:
                                continue

                            # Getting the sentence where we have the answer
                            sentence_tokens = []
                            for idx, start in enumerate(first_token_sentence):
                                if answer_span[0] >= start:
                                    sentence_tokens = context_sentences[idx]
                                    answer_sentence_span = [
                                        span - start for span in answer_span
                                    ]
                                else:
                                    break

                            # write to file
                            sent = []
                            for idx, token in enumerate(sentence_tokens):
                                if token.strip("\n").strip():
                                    if idx in answer_sentence_span:
                                        sent.append(token + u"│" + "1")
                                    else:
                                        sent.append(token + u"│" + "0")
                            sent = " ".join(sent)
                            sent = sent.strip()
                            index = sent.find("(│0 CNN│0 )│0 --│0 ")
                            if index > -1:
                                sent = sent[index +
                                            len("(│0 CNN│0 )│0 --│0 "):]

                            ctxt = []
                            for idx, token in enumerate(context_tokens):
                                if token.strip("\n").strip():
                                    if idx in answer_span:
                                        ctxt.append(token + u"│" + "1")
                                    else:
                                        ctxt.append(token + u"│" + "0")
                            ctxt = " ".join(ctxt)
                            ctxt = ctxt.strip()
                            index = ctxt.find("(│0 CNN│0 )│0 --│0 ")
                            if index > -1:
                                ctxt = ctxt[index +
                                            len("(│0 CNN│0 )│0 --│0 "):]

                            context_file.write(ctxt + "\n")
                            sentence_file.write(sent + "\n")
                            question_file.write(q + "\n")
                            answer_file.write(answer + "\n")
Example #21
    def tokenize_word():
        print('-- tokenize words')
        words = [word_tokenize(data) for data in raw_data + test_data]
        labels = [label for label in raw_labels + test_labels]

        return words, labels
Example #22
parser = argparse.ArgumentParser()
parser.add_argument('--input',
                    '-i',
                    required=True,
                    help='input preprocessed text file')
parser.add_argument('--output', '-o', required=True, help='output file name')
args = parser.parse_args()

title_text_pair = []

with open(args.input, 'r') as f:
    lines = f.read().splitlines()
    title = []
    text = []
    this_title = ''
    this_text = ''
    for line in lines:
        if len(word_tokenize(line)) < 30 and line[-1] != '.':
            this_title = line
        else:
            this_text = line
            title_text_pair.append((this_title, this_text))

with open(args.output, 'w') as f:
    titles = []
    for title, text in title_text_pair:
        title = title.replace(' ', '_')
        title = title.replace(',', '')
        if title in titles:
            title = title + '_'
        titles.append(title)
        f.write(title + '\n')
        sents = [s for s in sent_tokenize(text) if len(word_tokenize(s)) >= 5]
Example #23
    def predict(self, raw_sentences):
        if type(raw_sentences) is not list: raw_sentences = [raw_sentences]

        if self.params.get('convert_slang', True):
            split_sentences = [
                sentence.split(' ') for sentence in raw_sentences
            ]
            convert_sentences = [[
                self.slang_dict[_word] if _word in self.slang_dict else _word
                for _word in word
            ] for word in split_sentences]

            raw_sentences = [
                clean_str(' '.join(sentence)) for sentence in convert_sentences
            ]

        with tf.device('/cpu:0'):
            sess, word2id, label2id = self.model.sess, self.word_vocab[
                'word2id'], self.label_vocab['label2id']

        # tokenize word and create faked labels
        sentences = [word_tokenize(sentence) for sentence in raw_sentences]
        labels = [self.label_vocab['id2label'][0]] * len(
            sentences)  # just faked label

        dataset = create_dataset(sentences, labels, word2id, label2id)

        #
        if self.params.get('sentiment_lexicon', False):
            lexicons = train.convert_sentiment_lexicon(
                self.lexicon_vocab['lexicon2id'], sentences)
            dataset = add_to_dataset(dataset, lexicons,
                                     self.lexicon_vocab['lexicon2id'],
                                     'lexicon_ids')

        #
        if self.params.get('pos', False):
            pos = train.convert_pos(self.pos_vocab['pos2id'], sentences)
            dataset = add_to_dataset(dataset, pos, self.pos_vocab['pos2id'],
                                     'pos_ids')

        #
        if self.params.get('vader_lexicon', False):
            vaderlexicons = train.convert_vader_lexicon(
                self.vader_lexicon_vocab['vaderlexicon2id'], sentences)
            dataset = add_to_dataset(
                dataset, vaderlexicons,
                self.vader_lexicon_vocab['vaderlexicon2id'],
                'vader_lexicon_ids')

        # prepare input feed dict
        ip_feed_dict = self.model.create_input(dataset)
        for k, v in self.model.create_additional_input(dataset).items():
            ip_feed_dict[k] = v

        if hasattr(self.model, 'dropout'):
            ip_feed_dict[self.model.dropout] = 1.0

        if hasattr(self.model, 'is_training'):
            ip_feed_dict[self.model.is_training] = False

        predict, predict_proba = sess.run(
            [self.model.logits,
             tf.nn.softmax(self.model.logits)],
            feed_dict=ip_feed_dict)

        predict_ids = np.argmax(predict, axis=1)
        return \
            [
                ((
                    self.label_vocab['id2label'][predict_id],
                    predict_proba[id][predict_id],
                    {self.label_vocab['id2label'][i]: p for i, p in enumerate(predict_proba[id])}
                ))
                for id, predict_id in enumerate(predict_ids)
            ]
Example #24
    def make_dataset(self, train_path, test_path, is_convert_slang,
                     is_sentiment_lexicon, is_pos, is_vader_lexicon):
        # load file
        raw_data, raw_labels = load_train_file(train_path)
        test_data, test_labels = load_train_file(test_path)
        len_train = len(raw_data)

        print('-- tokenize words')
        words = [word_tokenize(data) for data in raw_data + test_data]
        labels = [label for label in raw_labels + test_labels]

        #
        # convert slang into its corresponding word (maybe not?)
        #
        if is_convert_slang:
            slang_path = os.path.dirname(os.path.realpath(
                __file__)) + '/data/preprocess/slang/slang.pkl'
            slang_dict = load_dict(slang_path)

            words = train.convert_slang(slang_dict, words)

        #
        # split train and test.
        #
        print('-- split train and test data')
        train_words, train_labels, test_words, test_labels = words[:len_train], labels[:len_train], words[
            len_train:], labels[len_train:]

        #
        # create dataset
        #
        train_dataset = create_dataset(train_words, train_labels,
                                       self.word_vocab['word2id'],
                                       self.label_vocab['label2id'])
        test_dataset = create_dataset(test_words, test_labels,
                                      self.word_vocab['word2id'],
                                      self.label_vocab['label2id'])

        #
        # add sentiment_lexicon as additional features (maybe not?)
        #
        if is_sentiment_lexicon:
            lexicon_path = os.path.dirname(os.path.realpath(
                __file__)) + '/data/preprocess/lexicon/lexicon.pkl'
            lexicon_dct = load_dict(lexicon_path)

            lexicons = train.convert_sentiment_lexicon(lexicon_dct, words,
                                                       'neu')
            train_lexicons, test_lexicons = lexicons[:len_train], lexicons[
                len_train:]

            train_dataset = add_to_dataset(train_dataset, train_lexicons,
                                           self.lexicon_vocab['lexicon2id'],
                                           'lexicon_ids')
            test_dataset = add_to_dataset(test_dataset, test_lexicons,
                                          self.lexicon_vocab['lexicon2id'],
                                          'lexicon_ids')

        #
        # add pos (part-of-speech) as additional features (maybe not?)
        #
        if is_pos:
            pos = train.convert_pos(words)
            train_pos, test_pos = pos[:len_train], pos[len_train:]

            train_dataset = add_to_dataset(train_dataset, train_pos,
                                           self.pos_vocab['pos2id'], 'pos_ids')
            test_dataset = add_to_dataset(test_dataset, test_pos,
                                          self.pos_vocab['pos2id'], 'pos_ids')

        #
        # add vader_lexicon as additional features (maybe not?)
        #
        if is_vader_lexicon:
            vaderlexicons = train.convert_vader_lexicon(
                self.vader_lexicon_vocab['vaderlexicon2id'], words)
            train_vaderlexicons, test_vaderlexicons = vaderlexicons[:len_train], vaderlexicons[
                len_train:]

            train_dataset = add_to_dataset(
                train_dataset, train_vaderlexicons,
                self.vader_lexicon_vocab['vaderlexicon2id'],
                'vader_lexicon_ids')
            test_dataset = add_to_dataset(
                test_dataset, test_vaderlexicons,
                self.vader_lexicon_vocab['vaderlexicon2id'],
                'vader_lexicon_ids')

        return train_dataset, test_dataset
Example #25
def read_news(news_path, args, mode='train'):
    news = {}
    categories = []
    subcategories = []
    domains = []
    news_index = {}
    index = 1
    word_cnt = Counter()

    with tf.io.gfile.GFile(news_path, "r") as f:
        for line in tqdm(f):
            splited = line.strip('\n').split('\t')
            doc_id, category, subcategory, title, abstract, url, _, _ = splited
            body = ""
            news_index[doc_id] = index
            index += 1

            if 'title' in args.news_attributes:
                title = title.lower()
                title = word_tokenize(title)
            else:
                title = []

            if 'abstract' in args.news_attributes:
                abstract = abstract.lower()
                abstract = word_tokenize(abstract)
            else:
                abstract = []

            if 'body' in args.news_attributes:
                body = body.lower()[:2000]
                body = word_tokenize(body)
            else:
                body = []

            if 'category' in args.news_attributes:
                categories.append(category)
            else:
                category = None
            
            if 'subcategory' in args.news_attributes:
                subcategories.append(subcategory)
            else:
                subcategory = None

            if 'domain' in args.news_attributes:
                domain = get_domain(url)
                domains.append(domain)
            else:
                domain = None

            news[doc_id] = [title, abstract, body, category, domain, subcategory]
            if mode == 'train':
                word_cnt.update(title + abstract + body)

    if mode == 'train':
        word = [k for k, v in word_cnt.items() if v > args.filter_num]
        word_dict = {k: v for k, v in zip(word, range(1, len(word) + 1))}
        categories = list(set(categories))
        category_dict = {}
        index = 1
        for x in categories:
            category_dict[x] = index
            index += 1

        subcategories = list(set(subcategories))
        subcategory_dict = {}
        index = 1
        for x in subcategories:
            subcategory_dict[x] = index
            index += 1

        domains = list(set(domains))
        domain_dict = {}
        index = 1
        for x in domains:
            domain_dict[x] = index
            index += 1

        return news, news_index, category_dict, word_dict, domain_dict, subcategory_dict
    elif mode == 'test':
        return news, news_index
    else:
        assert False, 'Wrong mode!'
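A compact, behavior-equivalent sketch of the 1-based id maps assembled in the three loops above (iteration order over a set is arbitrary, as in the original):

# Same shape of result as the manual loops: ids start at 1, one per unique key.
category_dict = {c: i for i, c in enumerate(set(categories), start=1)}
subcategory_dict = {s: i for i, s in enumerate(set(subcategories), start=1)}
domain_dict = {d: i for i, d in enumerate(set(domains), start=1)}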
Example #26
        doc_title_words_lower = [t.lower() for t in doc_title_words]
        doc_title_words_lower_lemmatized = [
            lemmatizer.lemmatize(t) for t in doc_title_words_lower
        ]
        doc_title_words_lower_lemmatized = [
            t for t in doc_title_words_lower_lemmatized
            if t not in stopwords.words('english')
        ]
        sents = docsentdic[doc]
        for sent in sents:
            sent_words = sentworddic[sent]
            sent_words_lemmatized = [
                lemmatizer.lemmatize(t) for t in sent_words
            ]
            # Check sentence length
            words = word_tokenize(sentwordrawdic[sent])
            if len(words) < SENTENCE_MIN_LENGTH:
                continue
            if list(set(hand_stopwords).intersection(set(sent_words))):
                continue
            # Check whether index term or document title word is in the sentence
            if (not list(
                    set(index_term_lower_lemmatized).intersection(
                        set(sent_words_lemmatized)))) and (not list(
                            set(doc_title_words_lower_lemmatized).intersection(
                                set(sent_words_lemmatized)))):
                continue
            quiz_doc_sent_tuple.append((doc, sent))

    logging.info("Performing gap search...")
Example #27
    def split_data(self, filename):
        self.load_data(filename)
        sub_dir = filename.split('-')[0]

        # create a subdirectory for Train and Dev data
        if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
            os.makedirs(os.path.join(self.data_dir, sub_dir))

        with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file:

            # loop over the data
            for article_id in tqdm.tqdm(range(len(self.data['data']))):
                list_paragraphs = self.data['data'][article_id]['paragraphs']
                # loop over the paragraphs
                for paragraph in list_paragraphs:
                    context = paragraph['context']
                    context = clean_text(context)
                    context_tokens = word_tokenize(context)
                    if config.paragraph and (
                            len(context_tokens) < config.min_len_context
                            or len(context_tokens) > config.max_len_context):
                        continue
                    context_sentences = sent_tokenize(context)
                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)
                    qas = paragraph['qas']
                    # loop over Q/A
                    for qa in qas:
                        question = qa['question']
                        question = clean_text(question)
                        question_tokens = word_tokenize(question)
                        if question_tokens[-1] != "?" or len(
                                question_tokens
                        ) < config.min_len_question or len(
                                question_tokens) > config.max_len_question:
                            continue
                        if sub_dir == "train":
                            # select only one ground truth (the top answer), if any answers exist
                            answer_ids = 1 if qa['answers'] else 0
                        else:
                            answer_ids = len(qa['answers'])
                        if answer_ids:
                            for answer_id in range(answer_ids):
                                answer = qa['answers'][answer_id]['text']
                                answer = clean_text(answer)
                                answer_tokens = word_tokenize(answer)
                                answer_start = qa['answers'][answer_id][
                                    'answer_start']
                                answer_stop = answer_start + len(answer)

                                # Getting spans of the answer in the context
                                answer_span = []
                                for idx, span in enumerate(spans):
                                    if not (answer_stop <= span[0]
                                            or answer_start >= span[1]):
                                        answer_span.append(idx)
                                if not answer_span:
                                    continue

                                # Getting the sentence where we have the answer
                                sentence_tokens = []
                                for idx, start in enumerate(
                                        first_token_sentence):
                                    if answer_span[0] >= start:
                                        sentence_tokens = context_sentences[
                                            idx]
                                        answer_sentence_span = [
                                            span - start
                                            for span in answer_span
                                        ]
                                    else:
                                        break
                                if not sentence_tokens:
                                    print("Sentence cannot be found")
                                    raise Exception()

                            # write to file
                            context_file.write(" ".join([
                                token + u"│" +
                                "1" if idx in answer_span else token + u"│" +
                                "0" for idx, token in enumerate(context_tokens)
                            ]) + "\n")
                            sentence_file.write(" ".join([
                                token + u"│" +
                                "1" if idx in answer_sentence_span else token +
                                u"│" + "0"
                                for idx, token in enumerate(sentence_tokens)
                            ]) + "\n")
                            question_file.write(
                                " ".join([token
                                          for token in question_tokens]) +
                                "\n")
                            answer_file.write(
                                " ".join([token
                                          for token in answer_tokens]) + "\n")
Example #28
]
index_term_lower_lemmatized = [
    t for t in index_term_lower_lemmatized
    if t not in stopwords.words('english')
]

logging.info("Constructing document words...")
docwords = {}

for doc in docsentdic:
    sents_in_doc = docsentdic[doc]
    doc_words_lower_lemmatized = []
    for sent in sents_in_doc:
        doc_words_lower_lemmatized += [
            lemmatizer.lemmatize(t)
            for t in word_tokenize(sentwordrawdic[sent], lower=True)
        ]
    doc_words_lower_lemmatized = [
        t for t in doc_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    docwords[doc] = doc_words_lower_lemmatized


def score_dt_s(s, d):
    doc_title_words = d.replace('_', ' ').replace('/', ' ').split()
    doc_title_words_lower = [t.lower() for t in doc_title_words]
    doc_title_words_lower_lemmatized = [
        lemmatizer.lemmatize(t) for t in doc_title_words_lower
    ]
    doc_title_words_lower_lemmatized = [
Example #29
    def split_sentence_question(self, filename, data_type):
        data = self.load_data(filename)
        with open(os.path.join(self.save_dir + data_type + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
             open(os.path.join(self.save_dir + data_type + '.question'), 'w', encoding="utf-8") as question_file:

            articles = data
            for article in tqdm(articles):
                paragraphs = article['paragraphs']
                for paragraph in paragraphs:
                    context = paragraph['context']

                    context = clean_text(context)
                    context_tokens = word_tokenize(context)
                    context_sentences = sent_tokenize(context)

                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)

                    question_and_answer_list = paragraph['qas']
                    for question_and_answer in question_and_answer_list:
                        question = question_and_answer['question']
                        question = clean_text(question)
                        question_tokens = word_tokenize(question)

                        if len(question_tokens) > MAX_QUESTION_LENGTH or len(
                                question_tokens) < MIN_QUESTION_LENGTH:
                            continue

                        if not question_and_answer['answers']: continue
                        answer = question_and_answer['answers'][0]
                        answer_text = answer['text']
                        answer_text = clean_text(answer_text)
                        answer_tokens = word_tokenize(answer_text)
                        answer_start = answer['answer_start']
                        answer_stop = answer_start + len(answer_text)

                        answer_span = []
                        for idx, span in enumerate(spans):
                            if not (answer_stop <= span[0]
                                    or answer_start >= span[1]):
                                answer_span.append(idx)
                        if not answer_span: continue

                        sentence_tokens = []
                        for idx, start in enumerate(first_token_sentence):
                            if answer_span[0] >= start:
                                sentence_tokens = context_sentences[idx]
                                answer_sentence_span = [
                                    span - start for span in answer_span
                                ]
                            else:
                                break
                        if not sentence_tokens:
                            print("Sentence cannot be found")
                            raise Exception()

                        if len(sentence_tokens) > MAX_SENTENCE_LENGTH or len(
                                sentence_tokens) < MIN_SENTENCE_LENGTH:
                            continue

                        sentence_file.write(" ".join([
                            token + u"│" +
                            "1" if idx in answer_sentence_span else token +
                            u"│" + "0"
                            for idx, token in enumerate(sentence_tokens)
                        ]) + "\n")
                        question_file.write(
                            " ".join([token
                                      for token in question_tokens]) + "\n")
Example #30
    def extract_features(self,
                         max_len_context=config.max_len_context,
                         max_len_question=config.max_len_question,
                         max_len_word=config.max_len_word,
                         is_train=True):
        # choose the right directory
        directory = "train" if is_train else "dev"

        # load context
        with open(os.path.join(self.data_dir, directory,
                               directory + ".context"),
                  "r",
                  encoding="utf-8") as c:
            context = c.readlines()
        # load questions
        with open(os.path.join(self.data_dir, directory,
                               directory + ".question"),
                  "r",
                  encoding="utf-8") as q:
            question = q.readlines()
        # load answer
        with open(os.path.join(self.data_dir, directory,
                               directory + ".labels"),
                  "r",
                  encoding="utf-8") as l:
            labels = l.readlines()

        # clean and tokenize context and question
        context = [[w for w in word_tokenize(clean_text(doc.strip('\n')))]
                   for doc in context]
        question = [[w for w in word_tokenize(clean_text(doc.strip('\n')))]
                    for doc in question]

        # build the vocabulary and embeddings if not done yet (train split only)
        if directory == "train":
            labels = [
                np.array(l.strip("\n").split(), dtype=np.int32) for l in labels
            ]

            word_vocab, word2idx, char_vocab, char2idx = build_vocab(
                directory + ".context",
                directory + ".question",
                "word_vocab.pkl",
                "word2idx.pkl",
                "char_vocab.pkl",
                "char2idx.pkl",
                is_train=is_train,
                max_words=config.max_words)
            # create an embedding matrix from the vocabulary with pretrained vectors (GloVe) for words
            build_embeddings(word_vocab,
                             embedding_path=config.glove,
                             output_path="word_embeddings.pkl",
                             vec_size=config.word_embedding_size)
            build_embeddings(char_vocab,
                             embedding_path="",
                             output_path="char_embeddings.pkl",
                             vec_size=config.char_embedding_size)

        else:
            labels = np.array([l.strip("\n") for l in labels])

            with open(os.path.join(self.data_dir, "train", "word2idx.pkl"), "rb") as wi,\
                 open(os.path.join(self.data_dir, "train", "char2idx.pkl"), "rb") as ci:
                word2idx = pickle.load(wi)
                char2idx = pickle.load(ci)

        print("Number of questions before filtering:", len(question))
        filter = [
            len(c) < max_len_context and max([len(w)
                                              for w in c]) < max_len_word
            and len(q) < max_len_question
            and max([len(w) for w in q]) < max_len_word and len(q) > 3
            for c, q in zip(context, question)
        ]
        context, question, labels = zip(
            *[(c, q, l)
              for c, q, l, f in zip(context, question, labels, filter) if f])
        print("Number of questions after filtering ", len(question))

        # replace the tokenized words with their associated ID in the vocabulary
        context_idxs = []
        context_char_idxs = []
        question_idxs = []
        question_char_idxs = []
        for i, (c, q) in tqdm.tqdm(enumerate(zip(context, question))):
            # create empty numpy arrays
            context_idx = np.zeros([max_len_context], dtype=np.int32)
            question_idx = np.zeros([max_len_question], dtype=np.int32)
            context_char_idx = np.zeros([max_len_context, max_len_word],
                                        dtype=np.int32)
            question_char_idx = np.zeros([max_len_question, max_len_word],
                                         dtype=np.int32)

            # replace 0 values with word and char IDs
            for j, word in enumerate(c):
                if word in word2idx:
                    context_idx[j] = word2idx[word]
                else:
                    context_idx[j] = 1
                for k, char in enumerate(word):
                    if char in char2idx:
                        context_char_idx[j, k] = char2idx[char]
                    else:
                        context_char_idx[j, k] = 1
            context_idxs.append(context_idx)
            context_char_idxs.append(context_char_idx)

            for j, word in enumerate(q):
                if word in word2idx:
                    question_idx[j] = word2idx[word]
                else:
                    question_idx[j] = 1
                for k, char in enumerate(word):
                    if char in char2idx:
                        question_char_idx[j, k] = char2idx[char]
                    else:
                        question_char_idx[j, k] = 1
            question_idxs.append(question_idx)
            question_char_idxs.append(question_char_idx)
        # save features as numpy arrays
        np.savez(os.path.join(self.data_dir, directory,
                              directory + "_features"),
                 context_idxs=np.array(context_idxs),
                 context_char_idxs=np.array(context_char_idxs),
                 question_idxs=np.array(question_idxs),
                 question_char_idxs=np.array(question_char_idxs),
                 label=np.array(labels))
Example #31
def eval(context, question):
    with open(os.path.join(config.data_dir, "train", "word2idx.pkl"), "rb") as wi, \
         open(os.path.join(config.data_dir, "train", "char2idx.pkl"), "rb") as ci, \
         open(os.path.join(config.data_dir, "train", "word_embeddings.pkl"), "rb") as wb, \
         open(os.path.join(config.data_dir, "train", "char_embeddings.pkl"), "rb") as cb:
        word2idx = pickle.load(wi)
        char2idx = pickle.load(ci)
        word_embedding_matrix = pickle.load(wb)
        char_embedding_matrix = pickle.load(cb)

    # transform them into Tensors
    word_embedding_matrix = torch.from_numpy(
        np.array(word_embedding_matrix)).type(torch.float32)
    char_embedding_matrix = torch.from_numpy(
        np.array(char_embedding_matrix)).type(torch.float32)
    idx2word = dict([(y, x) for x, y in word2idx.items()])

    context = clean_text(context)
    context = [w for w in word_tokenize(context) if w]

    question = clean_text(question)
    question = [w for w in word_tokenize(question) if w]

    if len(context) > config.max_len_context:
        print("The context is too long. Maximum accepted length is",
              config.max_len_context, "words.")
    if max([len(w) for w in context]) > config.max_len_word:
        print("Some words in the context are longer than", config.max_len_word,
              "characters.")
    if len(question) > config.max_len_question:
        print("The question is too long. Maximum accepted length is",
              config.max_len_question, "words.")
    if max([len(w) for w in question]) > config.max_len_word:
        print("Some words in the question are longer than",
              config.max_len_word, "characters.")
    if len(question) < 3:
        print(
            "The question is too short. It needs to be at least three words long."
        )

    context_idx = np.zeros([config.max_len_context], dtype=np.int32)
    question_idx = np.zeros([config.max_len_question], dtype=np.int32)
    context_char_idx = np.zeros([config.max_len_context, config.max_len_word],
                                dtype=np.int32)
    question_char_idx = np.zeros(
        [config.max_len_question, config.max_len_word], dtype=np.int32)

    # replace 0 values with word and char IDs
    for j, word in enumerate(context):
        if word in word2idx:
            context_idx[j] = word2idx[word]
        else:
            context_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                context_char_idx[j, k] = char2idx[char]
            else:
                context_char_idx[j, k] = 1

    for j, word in enumerate(question):
        if word in word2idx:
            question_idx[j] = word2idx[word]
        else:
            question_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                question_char_idx[j, k] = char2idx[char]
            else:
                question_char_idx[j, k] = 1

    model = BiDAF(word_vectors=word_embedding_matrix,
                  char_vectors=char_embedding_matrix,
                  hidden_size=config.hidden_size,
                  drop_prob=config.drop_prob)
    try:
        if config.cuda:
            model.load_state_dict(
                torch.load(os.path.join(config.squad_models,
                                        "model_final.pkl"))["state_dict"])
        else:
            model.load_state_dict(
                torch.load(
                    os.path.join(config.squad_models, "model_final.pkl"),
                    map_location=lambda storage, loc: storage)["state_dict"])
        print("Model weights successfully loaded.")
    except:
        print(
            "Model weights not found; initialized model with random weights.")
    model.to(device)
    model.eval()
    with torch.no_grad():
        context_idx, context_char_idx, question_idx, question_char_idx = torch.tensor(context_idx, dtype=torch.int64).unsqueeze(0).to(device),\
                                                                         torch.tensor(context_char_idx, dtype=torch.int64).unsqueeze(0).to(device),\
                                                                         torch.tensor(question_idx, dtype=torch.int64).unsqueeze(0).to(device),\
                                                                         torch.tensor(question_char_idx, dtype=torch.int64).unsqueeze(0).to(device)

        pred1, pred2 = model(context_idx, context_char_idx, question_idx,
                             question_char_idx)
        starts, ends = discretize(pred1.exp(), pred2.exp(), 15, False)
        prediction = " ".join(context[starts.item():ends.item() + 1])

    return prediction
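A hedged call sketch for the eval helper above (note that it shadows Python's built-in eval). It assumes the pickled vocabularies, the embedding matrices and a trained model_final.pkl already exist under config.data_dir / config.squad_models; the paragraph and question below are placeholders:

# Hypothetical invocation with placeholder text.
context = ("The Normans were the people who in the 10th and 11th centuries "
           "gave their name to Normandy, a region in France.")
question = "In what country is Normandy located?"
print(eval(context, question))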