Example No. 1
def get_data(source_path, target_path, source_embedding_path, target_embedding_path):
    # source_word, target_word = set(), set()
    ## words could be filtered here by a minimum count threshold
    source_word_dict, target_word_dict = defaultdict(int), defaultdict(int)
    # source_path = "/home/FuDawei/NLP/Machine_Translation/dataset/datum2017/Book1_en.txt"
    # target_path = "/home/FuDawei/NLP/Machine_Translation/dataset/datum2017/Book1_cn.txt"
    get_wordset(source_path, source_word_dict)
    # get_wordset(source_dev_path, source_word_dict)
    # get_wordset(target_train_path, target_word_dict)
    get_wordset(target_path, target_word_dict)

    # source_thresh_cnt = 1
    # target_thresh_cnt = 8
    # source_word = list(filter(lambda x: source_word_dict[x]>source_thresh_cnt, source_word_dict.keys()))
    # target_word = list(filter(lambda x: target_word_dict[x]>target_thresh_cnt, target_word_dict.keys()))
    source_word = list(source_word_dict.keys())
    target_word = list(target_word_dict.keys())

    save_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/"

    # path, Word, glove_dim, ignore_head, debug=True
    target_embedding, target_word2id, target_id2word = get_embedding(target_embedding_path, target_word, 300, 1)
    print("target "+str(len(target_word2id)))
    source_embedding, source_word2id, source_id2word = get_embedding(source_embedding_path, source_word, 300, 0)
    print("source "+str(len(source_word2id)))
    
    dump_data(source_embedding, save_dir+"source_embedding.json")
    dump_data(source_word2id, save_dir+"source_word2id.json")
    dump_data(source_id2word, save_dir+"source_id2word.json")

    dump_data(target_embedding, save_dir+"target_embedding.json")
    dump_data(target_word2id, save_dir+"target_word2id.json")
    dump_data(target_id2word, save_dir+"target_id2word.json")
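
The get_embedding call here takes (path, word list, embedding dimension, ignore_head). A minimal sketch of such a loader, assuming a GloVe/word2vec-style text file; the real implementation is not shown in this snippet and may also add special tokens such as unk/pad:

def get_embedding(path, words, dim, ignore_head):
    # Read a GloVe/word2vec-style text file and keep only the requested words.
    wanted = set(words)
    word2id, id2word, vectors = {}, {}, []
    with open(path, encoding="utf-8") as f:
        if ignore_head:  # word2vec text files start with a "<count> <dim>" header line
            next(f)
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) == dim + 1 and parts[0] in wanted and parts[0] not in word2id:
                idx = len(word2id)
                word2id[parts[0]] = idx
                id2word[idx] = parts[0]
                vectors.append([float(x) for x in parts[1:]])
    return vectors, word2id, id2word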
Example No. 2
def get_if_duplicate(sentence1, sentence2):
    is_duplicate = 0
    try:
        global model
        if sentence1 and sentence2:
            sentence1 = tokenize_sent(str(sentence1).lower())
            sentence2 = tokenize_sent(str(sentence2).lower())
            sentence1 = [get_embedding(w) for w in sentence1]
            sentence2 = [get_embedding(w) for w in sentence2]

            len1 = len(sentence1)
            len2 = len(sentence2)


            results = model.sess.run(
                        model.is_duplicate,
                        feed_dict = model.get_feed_dict(
                                        [sentence1],
                                        [sentence2],
                                        [len1],
                                        [len2],
                                        None
                    )
            )
            is_duplicate = results[0]

    except Exception as e:
        print(str(e))
    return dup_dict[is_duplicate]
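
get_if_duplicate relies on a few globals that the snippet does not show. A hedged sketch of what they could look like (names and behaviour are assumptions, not the original code):

# Assumed helpers for the snippet above (hypothetical):
dup_dict = {0: "not duplicate", 1: "duplicate"}  # maps the model's 0/1 prediction to a label

def tokenize_sent(text):
    # Simplest possible whitespace tokenizer; the original likely uses NLTK or similar.
    return text.split()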
Example No. 3
    def __init__(self, params):
        super(PGN, self).__init__()
        word_model_path = os.path.join(os.path.abspath('../'), 'data',
                                       'w2v.model')
        vocab_path = os.path.join(os.path.abspath('../'), 'data',
                                  'words_frequences.txt')
        self.params = params
        self.matrix = get_embedding(vocab_path, word_model_path, params)
        self.encoder = Encoder(params["vocab_size"], params["embed_size"],
                               self.matrix, params["enc_units"],
                               params["batch_size"])
        self.attention = BahdanauAttention(params["attn_units"])
        self.decoder = Decoder(params["vocab_size"], params["embed_size"],
                               self.matrix, params["dec_units"],
                               params["batch_size"])
        self.pointer = Pointer()
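
The constructor above only reads a handful of keys from params. A hypothetical params dict with exactly those keys (the values are placeholders, not taken from the original repo):

params = {
    "vocab_size": 30000,
    "embed_size": 256,
    "enc_units": 256,
    "dec_units": 256,
    "attn_units": 256,
    "batch_size": 32,
}
# model = PGN(params)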
Example No. 4
def get_similar_documents(query: str, count: int) -> List[Dict[str, str]]:
    """
    Run a k-NN search against the index and return the count most similar documents.
    """
    index = config.get_es_index()
    embedded_query = get_embedding(query)
    knn_query = {"size": count, "query": {"knn": {"embedding": {"vector": embedded_query, "k": count}}}}

    results = es_handler.search(index=index, body=knn_query)["hits"]["hits"]

    documents = []
    for res in results:
        doc = {"id": res["_id"], "score": res["_score"]}
        source = res["_source"]
        source.pop("embedding")
        doc.update(source)
        documents.append(doc)

    return documents
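
The knn query above only works if the index was created with a k-NN vector mapping. A sketch of such a mapping for the OpenSearch k-NN plugin, assuming es_handler is an OpenSearch/Elasticsearch client and the dimension matches get_embedding's output:

# Hypothetical index mapping assumed by the knn query above:
knn_mapping = {
    "settings": {"index": {"knn": True}},
    "mappings": {
        "properties": {
            "embedding": {"type": "knn_vector", "dimension": 384},  # must match the embedding size
        }
    },
}
# es_handler.indices.create(index=config.get_es_index(), body=knn_mapping)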
Example No. 5
def index_page(pageid: int) -> None:
    """
    add page to index
    - get document from s3
    - get embedding from document content
    - prepare es document
    - index es document
    """
    index_name = config.get_es_index()
    s3_bucket = config.get_s3_bucket()
    s3_prefix = config.get_s3_prefix()
    s3_file_uri = f"S3://{s3_bucket}/{s3_prefix}/{pageid}"
    with smart_open(s3_file_uri, "r") as fp:
        page = json.load(fp)

    page_id = page.pop("pageid")
    content = page.pop("content")
    page["uri"] = s3_file_uri
    page["embedding"] = get_embedding(content)

    es_handler.index(index_name, body=page, id=page_id)
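
Both this example and the previous one pass raw text to get_embedding and treat the result as a dense vector. A minimal sketch of such a helper using sentence-transformers; the model name and library choice are assumptions, since the original implementation is not shown:

from sentence_transformers import SentenceTransformer

_model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model; any sentence encoder works

def get_embedding(text):
    # Encode the text into a dense vector and return a plain list (JSON-serialisable for ES).
    return _model.encode(text).tolist()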
Example No. 6
def read_corpus(random, max_len):
    vocab, word2id, embedding = get_embedding(random, 300)
    sentsid_, sents_, tags_ = [], [], []
    logging.info("Start reading the dataset")
    with open(config.split_data, encoding='utf-8') as fr:
        lines = fr.readlines()
    sentid_, sent_, tag_ = [], [], []
    for line in lines:
        if line != '\n':
            char, label = line.strip().split()
            tag_.append(tag2label[label])
            if char.startswith("num"):
                sent_.append("num")
                sentid_.append(1)
            elif char.startswith("en"):
                sent_.append("en")
                sentid_.append(2)
            elif '\u4e00' <= char <= '\u9fa5' and char in vocab:  # a CJK character present in the vocabulary
                sent_.append(char)
                sentid_.append(word2id[char])
            else:
                sent_.append("unk")
                sentid_.append(0)
        else:
            if 3 < len(sent_) <= max_len:
                sents_.append(sent_)
                tags_.append(tag_)
                sentsid_.append(sentid_)
            sentid_, sent_, tag_ = [], [], []
    # padding is done in get_feed_dict rather than in advance here
    # padding_tags = tflearn.data_utils.pad_sequences(tags_, maxlen=max_len, value=3)
    # padding_sentsid = tflearn.data_utils.pad_sequences(sentsid_, maxlen=max_len, value=0)
    # print(sents_[0])
    # print(padding_sentsid[0])
    # print(padding_tags[0])
    return sentsid_, sents_, tags_
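
The comment in read_corpus defers padding to get_feed_dict. A minimal per-batch padding sketch under that assumption, with pad values taken from the commented-out tflearn calls (0 for character ids, 3 for tag ids):

def pad_batch(seqs, pad_value=0):
    # Pad every sequence in the batch to the length of the longest one.
    batch_max = max(len(s) for s in seqs)
    return [s + [pad_value] * (batch_max - len(s)) for s in seqs]

# e.g. inside get_feed_dict: pad_batch(sentsid_batch, 0) and pad_batch(tags_batch, 3)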
Example No. 7
        gru_out = tf.concat([tf.squeeze(context, 1),
                             tf.squeeze(gru_out, 1)], 1)
        gru_out = self.wc(gru_out)
        logits = self.ws(gru_out)

        return logits, state_h, state_c, aligment

    def init_states(self, batch_size):
        return (tf.zeros([batch_size, self.gru_size]),
                tf.zeros([batch_size, self.gru_size]))


from embedding import get_embedding

embedding_matrix1, embedding_matrix2, input_tensor, target_tensor, tokenizer1, tokenizer2 = get_embedding(
)

BUFFER_SIZE = len(input_tensor)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor) // BATCH_SIZE
embedding_dim = 256
units = 1024
# These should probably be vocabulary sizes (number of distinct tokens + 1),
# not the number of training sentences.
vocab_inp_size = len(input_tensor) + 1
vocab_tar_size = len(target_tensor) + 1

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)


def data_loader(input_tensor, target_tensor):
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor, target_tensor)).shuffle(len(input_tensor))
Example No. 8
                    default='predict',
                    help='one of three modes: train/test/predict')
parser.add_argument('--embedding_random',
                    type=str2bool,
                    default=True,
                    help='use random character embeddings (True) or pretrained ones (False); random by default')
parser.add_argument('--update_embedding',
                    type=str2bool,
                    default=True,
                    help='the embedding is trained (updated) by default')

args = parser.parse_args()

train_data, test_data = get_train_test_data(args.embedding_random,
                                            args.max_len)
vocab, word2id, embeddings = get_embedding(args.embedding_random,
                                           args.embedding_dim)

configs = tf.ConfigProto()
configs.gpu_options.allow_growth = True
configs.gpu_options.per_process_gpu_memory_fraction = 0.2
# paths setting
paths = {}
output_path = config.output_path
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path):
Example No. 9
    return train_dataloader, val_dataloader


if __name__ == '__main__':
    # vocab_size = 10000
    # vocabulary = get_all_vocabulary(train_file_path='dataset/train.csv', vocab_size=vocab_size)
    # assert isinstance(vocabulary, list)
    # assert isinstance(vocabulary[0], str)
    # assert len(vocabulary) <= vocab_size
    #
    with open('dataset/vocabulary.txt', 'r') as f:
        vocabulary = [v.strip() for v in f.readlines()]

    embedding, token2id, vocab_size = get_embedding(set(vocabulary))

    X_train, y_train, X_val, y_val, label2id, id2label = get_train_data(
        'dataset/train.csv', vocab2ids=token2id)

    print(X_train, y_train, X_val, y_val, label2id, id2label)

    train_loader, val_loader = build_dataloader(X_train,
                                                y_train,
                                                X_val,
                                                y_val,
                                                batch_size=128)

    for i, (x, y) in enumerate(train_loader):
        ic(x)
        ic(y)
Example No. 10
    def forward(self, input_ids=None):
        word_embeddings = self.embedding(input_ids)
        sentence_embedding = word_embeddings.unsqueeze(1)

        out = torch.cat([self.conv_and_pool(sentence_embedding, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)

        outputs = (out, )

        return outputs


if __name__ == '__main__':
    some_text_sentence = '今天股市大跌'  # "the stock market fell sharply today"
    words = list(jieba.cut(some_text_sentence))
    embedding, token2id, _ = get_embedding(set(words))

    text_cnn_model = TextCNN(embedding, each_filter_num=128, filter_heights=[2, 3, 5], drop_out=0.3,
                             num_classes=15)

    ids = [token2id[w] for w in words]

    some_text_sentence = '测试一个新句子'  # "test a new sentence"
    words = list(jieba.cut(some_text_sentence))
    # Note: this rebuilds the vocabulary, so the ids built above no longer match the new token2id.
    embedding, token2id, _ = get_embedding(set(words))

    # out = text_cnn_model(torch.tensor([ids]))

    # print(out)