import csv
import os

from allennlp.data.tokenizers.word_tokenizer import WordTokenizer


def dataPreparation(file):
    """
        Prepares the Data given a file for Machine Translation Task  
    """
    tsvFileName = os.path.join("../processedData/Fact2_Only_F1_H_exact_tokens/", os.path.basename(file).split(".")[0]+"Seq2seq.tsv")
    print (tsvFileName)
    tsvFile = open(tsvFileName, 'w', newline='\n')
    writer = csv.writer(tsvFile, delimiter='\t')

    with open(file, 'r') as f:
        reader = csv.reader(f)
        data = list(reader)
    header = data[0]

    fact2ProcessedTokens = []
    tokenizer = WordTokenizer()

    for each in data[1:]:
        id = each[header.index('id')]
        fact1 = each[header.index('fact1')]
        fact2 = each[header.index('Fact2')]
        ans = each[header.index('answerKey')]
        ansHypo = each[header.index(ans+"_Hypothesis")]

        inp = ansHypo + " @@SEP@@ " + fact1
        out = fact2

        '''print (ansHypo)
        print (fact1)
        print (fact2)
        print ('1',WordTokenizer().tokenize(ansHypo))
        print ('2',WordTokenizer().tokenize(fact1))
        print ('3',WordTokenizer().tokenize(fact2))
        '''
        inputTokens = [i.text for i in tokenizer.tokenize(ansHypo) + tokenizer.tokenize(fact1)]
        outputTokens = tokenizer.tokenize(fact2)
        strOutput = ""
        count = 0
        for word in outputTokens:
            if word.text in inputTokens:
                strOutput += word.text+" "
                count+=1

        if count < 2:
            continue

        fact2ProcessedTokens.append(count)

        #print ('4',strOutput.strip())
        #input("WAITYY")
        out = strOutput.strip()
        #print (id,"***",fact1,"***",fact2,"***",ans,"***",ansHypo,each[header.index('A_Hypothesis')],"***",each[header.index('B_Hypothesis')],"***",each[header.index('C_Hypothesis')],"***",each[header.index('D_Hypothesis')])

        writer.writerow([inp,out])
    tsvFile.close()

    #---------- print data stats ---------
    print ("Average tokens per data in file :",tsvFileName,"=",sum(fact2ProcessedTokens)/len(fact2ProcessedTokens))

    return tsvFileName
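# A minimal sketch of the overlap filtering above, on a made-up row (the example strings are
# illustrative, not from the original dataset): only Fact2 tokens that also appear in the
# answer hypothesis or Fact1 are kept as the target side of the TSV pair.
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer

sketch_tokenizer = WordTokenizer()
hypothesis = "plants need sunlight to grow"
fact1 = "sunlight provides energy for plants"
fact2 = "plants use energy from sunlight to grow"
input_tokens = {t.text for t in sketch_tokenizer.tokenize(hypothesis + " " + fact1)}
kept = [t.text for t in sketch_tokenizer.tokenize(fact2) if t.text in input_tokens]
print(hypothesis + " @@SEP@@ " + fact1, " ".join(kept), sep="\t")
# -> source: "plants need sunlight to grow @@SEP@@ sunlight provides energy for plants"
#    target: "plants energy sunlight to grow"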
Example No. 2
    def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 add_start_end_token: bool = True,
                 sentence_context_window: int = 2,
                 sentence_predictive_window: int = 1,
                 target_negative: bool = True,
                 dataset_path: str = "./dataset-cache/",
                 use_existing_cached_db: bool = True,
                 db_discriminator="def",
                 save_sentiment: bool = True,
                 sentiment_features: bool = True,
                 ner_model: str = None,
                 coreference_model: str = None,
                 min_story_sentences: int = 0,
                 max_story_sentences: int = 10 * 6,
                 positional_features: bool = True,
                 truncate_sequence_length: int = 200,
                 story_embedding: bool = False,
                 named_entity_embeddings: bool = False,
                 story_chunking: int = 100,
                 interleave_story_sentences: bool = False,
                 cuda_device: Union[List[int], int] = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._source_tokenizer = source_tokenizer or WordTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._target_token_indexers = target_token_indexers or self._source_token_indexers
        self._add_start_end_token = add_start_end_token
        self._sentence_context_window = sentence_context_window
        self._sentence_predictive_window = sentence_predictive_window
        self._target_negative: bool = target_negative
        self._dataset_path = dataset_path
        self._use_existing_cached_db = use_existing_cached_db
        self._db_discriminator = db_discriminator
        self._save_sentiment = save_sentiment
        self._sentiment_features = sentiment_features
        self._ner_model = ner_model
        self._coreference_model = coreference_model
        self._min_story_sentences = min_story_sentences
        self._max_story_sentences = max_story_sentences
        self._positional_features = positional_features
        self._truncate_sequence_length = truncate_sequence_length
        self._truncate_sequences = (truncate_sequence_length != 0)
        self._story_embedding = story_embedding
        self._named_entity_embeddings = named_entity_embeddings
        self._story_chunking = story_chunking
        self._interleave_story_sentences = interleave_story_sentences

        # For now just use a default indexer. In future look to cluster and reuse.
        self._story_token_indexer = SingleIdTokenIndexer(namespace="story")
        self._entity_token_indexer = SingleIdTokenIndexer(
            namespace="coreferences")

        self._cuda_device = cuda_device
Example No. 3
 def __init__(
     self,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     target_token_indexers: Dict[str, TokenIndexer] = None,
     source_add_start_token: bool = True,
     delimiter: str = "\t",
     source_max_tokens: Optional[int] = 256,
     target_max_tokens: Optional[int] = 32,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy)
     self._source_tokenizer = source_tokenizer or WordTokenizer(
         word_splitter=JustSpacesWordSplitter())
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
     self._delimiter = delimiter
     self._source_max_tokens = source_max_tokens
     self._target_max_tokens = target_max_tokens
     self._source_max_exceeded = 0
     self._target_max_exceeded = 0
     self.pre_sen = 10
     self.seg = pkuseg.pkuseg(model_name='medicine',
                              user_dict='../data/0510/mdg/user_dict.txt')
Example No. 4
 def __init__(
     self,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     target_token_indexers: Dict[str, TokenIndexer] = None,
     source_add_start_token: bool = True,
     delimiter: str = "\t",
     source_max_tokens: Optional[int] = 510,
     target_max_tokens: Optional[int] = 64,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy)
     self._source_tokenizer = source_tokenizer or WordTokenizer(
         word_splitter=JustSpacesWordSplitter())
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
     self._delimiter = delimiter
     self._source_max_tokens = source_max_tokens
     self._target_max_tokens = target_max_tokens
     self._source_max_exceeded = 0
     self._target_max_exceeded = 0
     self.pre_sen = 10
Example No. 5
 def __init__(self,
              min_combination_num: int = 3,
              max_combination_num: int = 5,
              rm_stop_word: bool = True,
              synonyms: bool = True,
              stem: bool = False,
              tokenization: bool = True,
              beam_sz: int = 5,
              candidate_percent: float = 1.0):
     self.min_combination_num = min_combination_num
     self.max_combination_num = max_combination_num
     self.rm_stop_word = rm_stop_word
     self.stem = stem
     self.tokenization = tokenization
     self.beam_sz = beam_sz
     self.candidate_percent = candidate_percent
     if self.stem:
         self.stemmer = PorterStemmer().stem_word
     else:
         self.stemmer = lambda x: x
     self.synonyms = synonyms
     if self.tokenization:
         from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
         self.tokenizer = WordTokenizer()
     if self.rm_stop_word:
         self.stop_words = list(set(stopwords.words('english'))) + [x for x in string.punctuation] + ['``', '\'\'']
     else:
         self.stop_words = []
Example No. 6
 def test_passes_through_correctly(self):
     word_processor = WordTokenizer()
     sentence = "this (sentence) has 'crazy' \"punctuation\"."
      tokens = [t.text for t in word_processor.tokenize(sentence)]
     expected_tokens = [
         "this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"",
         "punctuation", "\"", "."
     ]
     assert tokens == expected_tokens
Example No. 7
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 model: str = None) -> None:

        super().__init__(lazy=False)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers
        self._model = model
Example No. 8
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     """
     Expects JSON that looks like ``{"sentence": "..."}``.
     Runs the underlying model, and adds the ``"label"`` to the output.
     """
     sentence = json_dict["sentence"]
     if not hasattr(self._dataset_reader, "tokenizer") and not hasattr(
             self._dataset_reader, "_tokenizer"):
         tokenizer = WordTokenizer()
         sentence = [str(t) for t in tokenizer.tokenize(sentence)]
     return self._dataset_reader.text_to_instance(sentence)
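 # An illustrative call matching the JSON contract in the docstring above (the `predictor`
 # instance is hypothetical and not defined in this snippet):
 #   predictor.predict_json({"sentence": "AllenNLP makes sequence models easy."})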
Example No. 9
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    if not args.test_dataset:
        test_dataset_path = train_params.pop('test_dataset_path')
    else:
        test_dataset_path = args.test_dataset
    test_dataset = dataset_reader.read(test_dataset_path)
    if args.only_label:
        test_dataset = [
            d for d in test_dataset
            if d.fields['label'].label == args.only_label
        ]
    vocab = Vocabulary.from_files(vocab_dir)
    random.shuffle(valid_dataset)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.to(args.cuda_device)
    model.eval()

    torch.set_grad_enabled(False)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)

    for dataset in (valid_dataset, test_dataset):
        generator = iterator(dataset, shuffle=False, num_epochs=1)
        model.get_metrics(reset=True)
        for batch in tqdm(generator):
            batch = move_to_device(batch, cuda_device=args.cuda_device)
            model(premise=batch['premise'],
                  hypothesis=batch['hypothesis'],
                  label=batch['label'])
        metrics = model.get_metrics()
        pprint(metrics)
Example No. 10
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    vocab = Vocabulary.from_files(vocab_dir)
    random.shuffle(valid_dataset)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.eval()

    iterator = BasicIterator(batch_size=1)
    iterator.index_with(vocab)
    generator = iterator(valid_dataset)

    for i in range(10):
        batch = next(generator)
        label_token_to_index = vocab.get_token_to_index_vocabulary('labels')
        print('----')
        print(' '.join(
            model.convert_to_readable_text(batch['premise']['tokens'])[0]))
        for label, label_index in label_token_to_index.items():
            label_tensor = torch.tensor([label_index])
            enc_embs = model.embed(batch['premise']['tokens'])
            enc_mask = get_text_field_mask(batch['premise'])
            enc_hidden = model.encode(inputs=enc_embs,
                                      mask=enc_mask,
                                      drop_start_token=True)
            code, kld = model.sample_code_and_compute_kld(enc_hidden)
            generated = model.generate(code=code,
                                       label=label_tensor,
                                       max_length=enc_mask.sum(1) * 2,
                                       beam_size=10,
                                       lp_alpha=args.lp_alpha)
            text = model.convert_to_readable_text(generated[:, 0])[0]
            print(label)
            print(' '.join(text))
Example No. 11
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
Example No. 12
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(start_tokens=['<s>'], end_tokens=['</s>'],)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(
        train_params.pop('valid_dataset_path'))
    vocab = Vocabulary.from_files(vocab_dir)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.to(args.device)
    model.eval()

    iterator = BasicIterator(batch_size=args.batch_size)
    iterator.index_with(vocab)
    generator = iterator(valid_dataset, num_epochs=1, shuffle=False)
    label_index_to_token = vocab.get_index_to_token_vocabulary('labels')

    out_file = open(args.out, 'w')

    for batch in tqdm(generator):
        premise_tokens = batch['premise']['tokens']
        enc_embs = model.embed(premise_tokens.to(args.device))
        enc_mask = get_text_field_mask(batch['premise']).to(args.device)
        enc_hidden = model.encode(inputs=enc_embs, mask=enc_mask,
                                  drop_start_token=True)
        code, kld = model.sample_code_and_compute_kld(enc_hidden)
        pre_text = model.convert_to_readable_text(premise_tokens[:, 1:])
        label_tensor = batch['label'].to(args.device)
        generated = model.generate(
            code=code, label=label_tensor, max_length=25,
            beam_size=10, lp_alpha=args.lp_alpha)
        text = model.convert_to_readable_text(generated[:, 0])
        for pre_text_b, text_b, label_index_b in zip(pre_text, text, label_tensor):
            obj = {'sentence1': ' '.join(pre_text_b), 'sentence2': ' '.join(text_b),
                   'gold_label': label_index_to_token[label_index_b.item()]}
            out_file.write(json.dumps(obj))
            out_file.write('\n')
Example No. 13
    def _read(self, file_path: str) -> Iterator[Instance]:
        splitter = SpacyWordSplitter('en_core_web_sm', True, True, True)
        tokenizer = WordTokenizer(word_splitter=splitter)
        root = ElementTree.parse(file_path).getroot()
        xml_sents = root.findall("./sentence")

        for xml_sent in tqdm(xml_sents):
            text = xml_sent.find("text").text
            annotations = xml_sent.find('aspectTerms')
            if annotations is not None:
                annotations = annotations.findall("aspectTerm")
            else:
                annotations = []

            # Sorts the annotations by start character
            annotations.sort(key=lambda x: int(x.get('from')))

            # Tokenizes the sentence
            tokens = tokenizer.tokenize(text)

            # Assigns tags based on annotations
            tags = []
            next_ann = 0
            current = None
            for token in tokens:
                # Checks if the next annotation begins somewhere in this token
                start_entity = next_ann < len(annotations)
                start_entity = start_entity and token.idx <= int(
                    annotations[next_ann].get('from'))
                start_entity = start_entity and token.idx + len(
                    token.text) > int(annotations[next_ann].get('from'))

                if start_entity:
                    tags.append('I' if current is None else 'B')
                    current = annotations[next_ann]
                    next_ann += 1
                elif current is not None:
                    if token.idx < int(current.get('to')):
                        tags.append('I')
                    else:
                        tags.append('O')
                        current = None
                else:
                    tags.append('O')

            yield self.text_to_instance(xml_sent.get('id'), tokens, tags)
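# A self-contained sketch (hypothetical sentence and character span, default spaCy
# tokenisation assumed) of the tagging scheme used in `_read` above: the first token of an
# aspect term gets 'I' ('B' only when it directly follows another term), later tokens inside
# the span get 'I', and everything else gets 'O'.
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer

sketch_tokenizer = WordTokenizer()
sketch_text = "The battery life is great"
sketch_spans = [(4, 16)]  # character span of the aspect term "battery life"

sketch_tags, span_idx, open_span = [], 0, None
for token in sketch_tokenizer.tokenize(sketch_text):
    starts = (span_idx < len(sketch_spans)
              and token.idx <= sketch_spans[span_idx][0] < token.idx + len(token.text))
    if starts:
        sketch_tags.append('I' if open_span is None else 'B')
        open_span = sketch_spans[span_idx]
        span_idx += 1
    elif open_span is not None and token.idx < open_span[1]:
        sketch_tags.append('I')
    else:
        sketch_tags.append('O')
        open_span = None

print(sketch_tags)  # ['O', 'I', 'I', 'O', 'O']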
Example No. 14
    def _read(self, file_path: str) -> Iterator[Instance]:

        # Keys: title + abstractText
        splitter = SpacyWordSplitter('en_core_web_sm', True, True, True)
        tokenizer = WordTokenizer(word_splitter=splitter)
        with open(file_path, 'r') as f:
            json_docs = json.load(f)

        for article in json_docs['documents']:
            doc_name = article['pmid']
            title = article['title']
            abstract = article['abstractText']
            text = title + " " + abstract

            tokens = tokenizer.tokenize(text)

            yield self.text_to_instance(doc_name, tokens)
Example No. 15
    def __init__(self,
                 ablate_mode: str,
                 token_indexers: Optional[Dict[str, TokenIndexer]] = None,
                 tokenizer: Optional[Tokenizer] = None,
                 limit_number: Optional[int] = None,
                 normalize_outputs: Optional[Tuple[float, float]] = None,
                 lazy: bool = False) -> None:
        super().__init__(token_indexers, tokenizer, limit_number,
                         normalize_outputs, lazy)

        assert ablate_mode in ["years", "dates", "numbers"]
        self._ablate_mode = ablate_mode

        # Ensure the tokenizer creates the tags needed for filtering.
        # Since we may need spaCy's `like_num` property, which AllenNLP tokens do not inherit,
        # we keep the spaCy tokens directly.
        self._tokenizer = tokenizer or WordTokenizer(
            word_splitter=SpacyWordSplitter(
                pos_tags=True, ner=True, keep_spacy_tokens=True))
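# A minimal sketch (illustrative sentence, not from the original data) of why the raw spaCy
# tokens are kept: spaCy's `like_num` attribute can flag year/number-like tokens for the
# ablation filtering, and converted AllenNLP tokens would drop that attribute.
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

probe_tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(
    pos_tags=True, ner=True, keep_spacy_tokens=True))
probe_tokens = probe_tokenizer.tokenize("The trial enrolled 120 patients in 2015.")
print([t.text for t in probe_tokens if t.like_num])  # e.g. ['120', '2015']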
Example No. 16
def get_elmo_layer_representations(seq_len, text_array, remove_chars, word_ind_to_extract):

    model = ElmoEmbedder()
    tokenizer = WordTokenizer()
    
    # where to store layer-wise elmo embeddings of particular length
    elmo = {}
    for layer in range(-1,2):
        elmo[layer] = []

    if word_ind_to_extract < 0: # the index is specified from the end of the array, so invert the index
        from_start_word_ind_to_extract = seq_len + word_ind_to_extract
    else:
        from_start_word_ind_to_extract = word_ind_to_extract

    start_time = tm.time()    
        
    # before we've seen enough words to make up the sequence length, add the representation for the last word 'seq_len' times
    word_seq = text_array[:seq_len]
    for _ in range(seq_len):
        elmo = add_avrg_token_embedding_for_specific_word(word_seq,
                                                                     tokenizer,
                                                                     model,
                                                                     remove_chars,
                                                                     from_start_word_ind_to_extract,
                                                                     elmo)

    # then add the embedding of the last word in a sequence as the embedding for the sequence
    for end_curr_seq in range(seq_len, len(text_array)):
        word_seq = text_array[end_curr_seq-seq_len+1:end_curr_seq+1]
        elmo = add_avrg_token_embedding_for_specific_word(word_seq,
                                                          tokenizer,
                                                          model,
                                                          remove_chars,
                                                          from_start_word_ind_to_extract,
                                                          elmo)

        if end_curr_seq % 100 == 0:
            print('Completed {} out of {}: {}'.format(end_curr_seq, len(text_array), tm.time()-start_time))
            start_time = tm.time()

    print('Done extracting sequences of length {}'.format(seq_len))
    
    return elmo 
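# A toy illustration (made-up text_array, seq_len=3) of the sliding windows built above:
# the first `seq_len` sequences all reuse the initial window, and afterwards each window
# ends at the current word.
toy_text_array = ["the", "cat", "sat", "on", "the", "mat"]
toy_seq_len = 3
toy_windows = [toy_text_array[:toy_seq_len]] * toy_seq_len
toy_windows += [toy_text_array[end - toy_seq_len + 1:end + 1]
                for end in range(toy_seq_len, len(toy_text_array))]
print(toy_windows)
# [['the', 'cat', 'sat'], ['the', 'cat', 'sat'], ['the', 'cat', 'sat'],
#  ['cat', 'sat', 'on'], ['sat', 'on', 'the'], ['on', 'the', 'mat']]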
Example No. 17
    def __init__(self,
                 token_indexers: Optional[Dict[str, TokenIndexer]] = None,
                 tokenizer: Optional[Tokenizer] = None,
                 limit_number: Optional[int] = None,
                 normalize_outputs: Optional[Tuple[float, float]] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)

        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._tokenizer = tokenizer or WordTokenizer()
        self._limit_number = limit_number

        self._normalize_outputs_mean, self._normalize_outputs_std = normalize_outputs or (
            0.0, 1.0)

        self._dummy_text_field = TextField([Token("foo")],
                                           self._token_indexers)
Example No. 18
    def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 target: bool = True,
                 label: bool = False,
                 delimiter: str = "\t",
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._source_tokenizer = source_tokenizer or WordTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._target_token_indexers = target_token_indexers or self._source_token_indexers
        self._source_add_start_token = source_add_start_token
        self._delimiter = delimiter

        self._target = target
        self._label = label
Example No. 19
from allennlp.modules.attention import LinearAttention, BilinearAttention, DotProductAttention
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256

CUDA_DEVICE = 0

#def main():
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={
        'tokens': SingleIdTokenIndexer(namespace='target_tokens')
    })
train_dataset = reader.read('/.../en_el_train.txt')
validation_dataset = reader.read('/.../en_el_dev.txt')

vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  min_count={
                                      'tokens': 3,
                                      'target_tokens': 3
                                  })

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
Example No. 20
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.token_indexers import SingleIdTokenIndexer

data_reader_configs = {
    'debug': None,
    'ud-eng': None,
    'nc_zhen': {
        'source_tokenizer': WordTokenizer(),
        'target_tokenizer': CharacterTokenizer(),
        'source_token_indexers': {
            'tokens': SingleIdTokenIndexer()
        },
        'target_token_indexers': {
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        }
    },
    'wikitext': {}
}


def get_datareader_configs(dataset_name):
    return data_reader_configs.get(dataset_name)
Example No. 21
def main():
    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)
    # A TokenIndexer determines how string tokens get represented as arrays of indices in a model
    # SingleIdTokenIndexer = each token is represented by a single integer id
    # TokenCharactersIndexer = each token is represented by a list of character ids
    # Read a TSV file with paired instances (source, target)
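    # For instance (an illustrative aside, not used by this script), the same source field
    # could be indexed both ways by passing two indexers:
    #   from allennlp.data.token_indexers import TokenCharactersIndexer
    #   source_token_indexers={'tokens': SingleIdTokenIndexer(),
    #                          'token_characters': TokenCharactersIndexer()}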
    reader = CopyNetDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_namespace='tokens'  # Defaults to source_token_indexers
    )

    # Each dataset is a list of instances with (source_tokens, target_tokens) fields
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)
    """
    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    # input()
    """
    # vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset +
                                      test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099

    print("Vocab SIze :", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embedder for the "tokens" namespace, matching the indexer name used when reading the dataset
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      dropout=0.2))

    attention = DotProductAttention()
    print(attention)

    max_decoding_steps = 4  # TODO: make this variable

    model = CopyNetSeq2Seq(
        vocab,
        source_embedder,
        encoder,
        max_decoding_steps=max_decoding_steps,
        target_embedding_dim=TGT_EMBEDDING_DIM,
        # target_namespace = 'target_tokens',
        beam_size=beamSize,
        attention=attention)
    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # A data iterator specifies how to batch the dataset:
    # it shuffles the data and creates fixed-size batches
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)
    # BucketIterator pads each batch to the longest input in that batch and sorts the dataset
    # by the given field names and padding keys for efficient computation
    iterator = BucketIterator(batch_size=50,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        # patience = 3,
        num_epochs=numEpochs,
        cuda_device=CUDA_DEVICE)

    trainer.train()
    """
Example No. 22
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=30)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=30)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = SNLIModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters(),
                           lr=train_params.pop_float('lr', 1e-3))
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    kl_anneal_rate = train_params.pop_float('kl_anneal_rate', None)
    if kl_anneal_rate is None:
        kl_weight_scheduler = None
    else:
        kl_weight_scheduler = (lambda step: min(1.0, kl_anneal_rate * step))
        model.kl_weight = 0.0

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      labeled_iterator=labeled_iterator,
                      unlabeled_iterator=unlabeled_iterator,
                      train_labeled_dataset=train_labeled_dataset,
                      train_unlabeled_dataset=train_unlabeled_dataset,
                      validation_dataset=valid_dataset,
                      summary_writer=summary_writer,
                      serialization_dir=save_dir,
                      num_epochs=train_params.pop('num_epochs', 50),
                      iters_per_epoch=len(train_labeled_dataset) //
                      labeled_batch_size,
                      write_summary_every=100,
                      validate_every=2000,
                      patience=2,
                      clip_grad_max_norm=5,
                      kl_weight_scheduler=kl_weight_scheduler,
                      cuda_device=train_params.pop_int('cuda_device', 0),
                      early_stop=train_params.pop_bool('early_stop', True))
    trainer.train()
Example No. 23
import re
from .regex_expressions import *

from allennlp.data.tokenizers.word_tokenizer import WordTokenizer

tokenizer = WordTokenizer(end_tokens=['<EOS>'])


# TODO - extend settings and add emoji and emoticon processing
class Preprocessing(object):
    """
    Module for text pre-processing
    """
    def __init__(self, **kwargs):
        self.char_clean = kwargs.get('char_cleaning', False)
        self.char_normalize = kwargs.get('char_normalize', False)
        self.word_normalize = kwargs.get('word_normalization', False)
        self.expand = kwargs.get('expand', False)
        self.escape_punctuation = kwargs.get('escape_punctuation', False)
        self.negation = kwargs.get('negation', False)

    def split_text(self, text):
        return text.split()

    def tokenize(self, text):
        tokens = tokenizer.tokenize(text)
        return [t.text for t in tokens]

    def process_text(self, text):

        tokens = tokenizer.tokenize(text)
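# A minimal usage sketch (hypothetical settings; the body of process_text above is truncated):
#   pre = Preprocessing(char_cleaning=True)
#   pre.tokenize("I can't wait!")  # spaCy splits the contraction and '<EOS>' is appended,
#                                  # e.g. ['I', 'ca', "n't", 'wait', '!', '<EOS>']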
Example No. 24
def allennlp_full_tokenize(text, **tokenizer_kwargs):
    return WordTokenizer(**tokenizer_kwargs).tokenize(text)
Example No. 25
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter(),
                              start_tokens=['<s>'],
                              end_tokens=['</s>'])
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = QuoraParaphraseDatasetReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=35)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=35)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = SeparatedQuoraModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters())
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    trainer = SeparatedLVMTrainer(
        model=model,
        optimizer=optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop('num_epochs', 50),
        iters_per_epoch=len(train_labeled_dataset) // labeled_batch_size,
        write_summary_every=100,
        validate_every=2000,
        patience=train_params.pop('patience', 2),
        clip_grad_max_norm=5,
        cuda_device=train_params.pop_int('cuda_device', 0))
    trainer.train()
Example No. 26
from knu_ci.my_logger import Logger
from knu_ci.utils import conf, BASE_DIR

logger = Logger(__name__).get_logger()

config = conf['seq2seq_allen']
train_file = config['train_data']
valid_file = config['valid_data']
src_embedding_dim = config['src_embedding_dim']
trg_embedding_dim = config['trg_embedding_dim']
hidden_dim = config['hidden_dim']

cuda_device = 1 if torch.cuda.is_available() else 0

reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=WordTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={
        'tokens': SingleIdTokenIndexer(namespace='target_tokens')
    })

train_dataset = reader.read(os.path.join(BASE_DIR, train_file))
valid_dataset = reader.read(os.path.join(BASE_DIR, valid_file))

vocab = Vocabulary.from_instances(train_dataset + valid_dataset,
                                  min_count={
                                      'tokens': 3,
                                      'target_tokens': 3
                                  })
Example No. 27
 def __init__(self, sentence_splitter):
     self._sentence_splitter = sentence_splitter
     self._word_tokenizer = WordTokenizer()
Example No. 28
async def save_coreferences(coreference_model: Model, dataset_db: str, cuda_device: Union[List[int], int] = None,
                            save_batch_size: int = 25, sentence_chunks: int = 200):
    with dataset.connect(dataset_db, engine_kwargs=engine_kwargs) as db:

        coref_table = db.create_table('coreference')
        coref_table.create_column('story_id', db.types.bigint)
        coref_table.create_column('coref_id', db.types.integer)
        coref_table.create_column('start_span', db.types.integer)
        coref_table.create_column('end_span', db.types.integer)
        coref_table.create_column('mention_text', db.types.string)
        coref_table.create_column('context_text', db.types.string)
        coref_table.create_index(['story_id'])
        coref_table.create_index(['start_span'])
        coref_table.create_index(['end_span'])

        gpu_max_workers = 1

        if isinstance(cuda_device, (list, tuple)):
            gpu_max_workers = len(cuda_device)
            gpus = cuda_device
        else:
            gpus = [cuda_device]

        word_tokenizer = WordTokenizer()

        loop = asyncio.get_event_loop()

        with ThreadPoolExecutor(max_workers=gpu_max_workers) as executor:

            processors = []
            for gpu in gpus:
                processors.append(CoreferenceProcessor(coreference_model, dataset_db, cuda_device=gpu))
            processors_cycle = itertools.cycle(processors)

            tasks = []
            # Order by shortest to longest so possible failures are at the end.
            for story in db['story'].find(order_by=['sentence_num', 'id']):

                sentence_list = [s["text"] for s in db["sentence"].find(story_id=story["id"], order_by='id')]
                sentence_tokens = word_tokenizer.batch_tokenize(sentence_list)

                for sentence_chunk in more_itertools.chunked(sentence_tokens, n=sentence_chunks):
                    sentence_chunk_flat = list(more_itertools.flatten(sentence_chunk))

                    if len(sentence_chunk_flat) < 10:
                        continue

                    sentence_chunk_text = [t.text for t in sentence_chunk_flat]

                    tasks.append(loop.run_in_executor(executor, next(processors_cycle), sentence_chunk_text, story["id"]))

                    if len(tasks) == save_batch_size:
                        results = await asyncio.gather(*tasks)

                        for coref_to_save in results:
                            try:

                                db["coreference"].insert_many(copy.deepcopy(coref_to_save))

                            except Exception as e:
                                logging.error(e)

                        tasks = []

            results = await asyncio.gather(*tasks)

            for coref_to_save in results:
                try:

                    db["coreference"].insert_many(coref_to_save)

                except Exception as e:
                    logging.error(e)

            logger.info(f"Coreferences Saved")
Example No. 29
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 add_start_end_token: bool = False,
                 dataset_path: str = "./dataset-cache/",
                 use_existing_cached_db: bool = True,
                 db_discriminator="def",
                 min_story_sentences: int = 5,
                 max_story_sentences: int = 500,
                 min_sequence_length: int = 2,
                 max_sequence_length: int = 50,
                 max_avg_length_per_word=8,
                 max_word_length=25,
                 min_check_word_length=8,
                 story_chunking: int = 50,
                 ner_model: str = None,
                 coreference_model: str = None,
                 marked_sentences: bool = False,
                 cuda_device: Union[List[int], int] = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self._add_start_end_token = add_start_end_token

        self._dataset_path = dataset_path
        self._use_existing_cached_db = use_existing_cached_db
        self._db_discriminator = db_discriminator
        self._min_story_sentences = min_story_sentences
        self._max_story_sentences = max_story_sentences
        self._max_sequence_length = max_sequence_length
        self._min_sequence_length = min_sequence_length
        self._max_character_length = self._max_sequence_length * max_avg_length_per_word
        self._max_word_length = max_word_length
        self._min_check_word_length = min_check_word_length
        self._truncate_sequences = (max_sequence_length != 0)
        self._story_chunking = story_chunking
        self._ner_model = ner_model
        self._coreference_model = coreference_model
        self._marked_sentences = marked_sentences

        if "DATASET_PATH" in os.environ:
            self._dataset_path = os.getenv('DATASET_PATH')

        self._cuda_device = cuda_device

        self._allowed_tokens = set([])
        self._tried_tokens = set([])

        for t in nltk.corpus.stopwords.words('english'):
            self._allowed_tokens.add(t)
            self._tried_tokens.add(t)

        for t in STOP_WORDS:
            self._allowed_tokens.add(t)
            self._tried_tokens.add(t)

        for t in punctuation:
            self._allowed_tokens.add(t)
            self._tried_tokens.add(t)

        self._py_dictionary = PyDictionary()

        self._seen_datasets = set()

        self._tried_to_insert = []
        self._allowed_to_insert = []
Example No. 30
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname) for fname in
        ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset +
                                      test_dataset,
                                      min_count={
                                          'tokens': 1,
                                          'target_tokens': 1
                                      })

    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                              embedding_dim=256)
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    # embedding_dim=elmo_embedding_dim)
    #elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    # embedding_dim=256)
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    #Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))

    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim, hidden_dim=hidden_dim, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)
    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])

    #Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files("vocabulary_seq2seq")
    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")