def generate_answers_attention(session, model, word2id, qn_uuid_data, context_token_data, qn_token_data):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary mapping
    each unique ID to the generated answer, its probability (the product of the start and end probabilities), and the attention distribution.

    Inputs:
      session: TensorFlow session
      model: QAModel
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: lists

    Outputs:
      uuid2ans: dictionary mapping uuid (string) to [detokenized answer (string), probability, attention distribution]
    """
    uuid2ans = {} # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    num_batches = ((data_size-1) // model.FLAGS.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()

    print("Generating answers...")

    for batch in get_batch_generator(word2id, qn_uuid_data, context_token_data, qn_token_data, model.FLAGS.batch_size, model.FLAGS.context_len, model.FLAGS.question_len):

        # Get the predicted spans
        pred_start_batch, pred_end_batch, maxprob_batch = model.get_start_end_pos(session, batch)
        attn_distribution = model.get_attention_dist(session, batch)

        # Convert pred_start_batch and pred_end_batch to lists length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()
        maxprob_batch = maxprob_batch.tolist()
        attn_distribution_batch = attn_distribution.tolist()

        # For each example in the batch:
        for ex_idx, (pred_start, pred_end, maxprob, attn_dist) in enumerate(zip(pred_start_batch, pred_end_batch, maxprob_batch, attn_distribution_batch)):

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx] # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens
            pred_ans_tokens = context_tokens[pred_start : pred_end +1] # list of strings

            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = [detokenizer.detokenize(pred_ans_tokens, return_str=True), maxprob, attn_dist]

        batch_num += 1

        if batch_num % 10 == 0:
            print("Generated answers for %i/%i batches = %.2f%%" % (batch_num, num_batches, batch_num*100.0/num_batches))

    print("Finished generating answers for dataset.")

    return uuid2ans
def parsedToDB(data, cid, catID, fileid):
    file = document.objects.get(id=fileid)
    fileName = file.file_name
    fileName = fileName.split(".")[0]
    tempName = re.split(r'(\d+)', fileName)
    fileName = ""
    for part in tempName:
        fileName += (' ' + part)

    detokenizer = MosesDetokenizer()
    regex = re.compile(r'[^a-zA-Z0-9 \n\.]')
    data = data.replace('\x00', ' ')
    dbAnswer = data
    data = fileName + ' ' + data
    data = data.replace('<br>', ' ')
    data = regex.sub('', data)
    data_list = nltk.word_tokenize(data)
    data = [
        word for word in data_list if word not in stopwords.words('english')
    ]
    detokenizer.detokenize(data, return_str=True)
    dbInfo = " ".join(data).lower()
    botanswers.objects.create(answer=dbAnswer,
                              rating=0,
                              category_id=catID.id,
                              entities=dbInfo,
                              course_id=cid,
                              file_id=fileid)
Example #3
def detokenize(line):
    tokens = line.replace(" n't", "n't").split(' ')
    tokens = [map_brackets_bw(t) for t in tokens]
    detokenizer = MosesDetokenizer()
    res = detokenizer.detokenize(tokens, return_str=True)
    res = res[0].upper() + res[1:]
    return res
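# map_brackets_bw is not defined in this snippet; a minimal sketch, assuming it
# simply undoes Penn Treebank bracket escaping before detokenization, plus a
# hypothetical usage (sacremoses assumed installed):
PTB_BRACKETS = {"-LRB-": "(", "-RRB-": ")", "-LSB-": "[", "-RSB-": "]"}

def map_brackets_bw(token):
    # Return the literal bracket for a PTB placeholder, otherwise the token unchanged.
    return PTB_BRACKETS.get(token, token)

# detokenize("he did n't open the door -LRB- twice -RRB- .")
# would then yield roughly: "He didn't open the door (twice)."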
Example #4
def clean_text(raw_text, get_questions=False):
    """
    Words consist of letters or numbers
    :param raw_text: text (not divided into sentences)
    :return: list of sanitized sentences
    """
    # Tokenize text into sentences.
    raw_text = delete_parenthesis(raw_text)

    sentences = nltk.sent_tokenize(raw_text)

    # Tokenize each sentence. Moses handles apostrophes better than
    # nltk.word_tokenize: "can't" -> ("can" + "'t") rather than ("ca" + "n't")
    tokenizer = MosesTokenizer()
    sanitized_sentences = []
    for s in sentences:
        s_tokens = tokenizer.tokenize(s)
        if (not get_questions
                and s_tokens[-1] != '?') or (get_questions
                                             and s_tokens[-1] == '?'):
            sanitized_sentences.append(sanitize(s_tokens))

    #Sanitized tokens joined using detokenizer
    detokenizer = MosesDetokenizer()
    return [
        detokenizer.detokenize(s, return_str=True) for s in sanitized_sentences
    ]
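# Hypothetical usage; delete_parenthesis and sanitize are assumed to be defined
# elsewhere (strip parenthesised asides, drop unwanted tokens):
# clean_text("It rained all day. Did you stay home?")                      -> statements only
# clean_text("It rained all day. Did you stay home?", get_questions=True)  -> questions only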
Example #5
def generate_answers_with_start_end(model_flags, word2id, char2id, qn_uuid_data, 
                     context_token_data, qn_token_data, pred_start_batches, pred_end_batches):
    """
    Given precomputed start/end span predictions for a set of (context, question)
    pairs, each with a unique ID, generate an answer for each pair and return a
    dictionary mapping each unique ID to the generated answer.

    Inputs:
      model_flags: QAModel flags, batch size, must be the same for all models.
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: lists
      pred_start_batches, pred_end_batches: lists of per-batch predictions; each inner batch has length model_flags.batch_size

    Outputs:
      uuid2ans: dictionary mapping uuid (string) to predicted answer (string; detokenized)
    """
    uuid2ans = {} # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    num_batches = ((data_size-1) // model_flags.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()
    
    print "Generating %d answers..." % len(qn_uuid_data)
    for batch in get_batch_generator(word2id, char2id, qn_uuid_data, context_token_data, 
                                     qn_token_data, model_flags.batch_size, 
                                     model_flags.context_len, model_flags.question_len, model_flags.word_len):

        # Get the predicted spans
        pred_start_batch = pred_start_batches[batch_num]
        pred_end_batch = pred_end_batches[batch_num]

        # Convert pred_start_batch and pred_end_batch to lists length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()
        
        # For each example in the batch:
        for ex_idx, (pred_start, pred_end) in enumerate(zip(pred_start_batch, pred_end_batch)):

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx] # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens
            pred_ans_tokens = context_tokens[pred_start : pred_end +1] # list of strings

            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens, return_str=True)
            
        batch_num += 1

        if batch_num % 10 == 0:
            print "Generated answers for %i/%i batches = %.2f%%" % (batch_num, num_batches, batch_num*100.0/num_batches)

    print "Finished generating answers for dataset."

    return uuid2ans
Example #6
def run():
  detokenizer = MosesDetokenizer() 
  with open('data.json') as f:
    data = json.load(f)
  word2vec = data['word2vec']
  contexts = data['contexts']
  questions = data['questions']
  predictions = []
  for c,qs in tqdm(zip(contexts, questions), total=len(contexts)):
    if len(c) == 1:
      continue
    # Get vector embedding of context
    ce = []
    for sent in c:
      ct = sent_embed(sent,word2vec)
      ce.append(ct)

    # Get vector embedding of sentence
    # Find the most similar sentence in the context
    for q in qs:
      qe = sent_embed(q,word2vec)
      sims = [cosine_similarity(qe, cs) for cs in ce]
      max_sim = max(sims)
      idx = sims.index(max_sim)
      predictions.append(detokenizer.detokenize(c[idx], return_str=True))
  return predictions 
Example #7
 def __init__(self, endings, stop_phrases, only_bulleted_lines=True, confidence=95,  *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.endings = endings
     self.stop_phrases = stop_phrases
     self.only_bulleted_lines = only_bulleted_lines
     self.detokenizer = MosesDetokenizer()
     self.confidence = confidence
Example #9
 def __init__(self):
     try:
         from sacremoses import MosesDetokenizer
         self._detokenizer = MosesDetokenizer()
     except (ImportError, TypeError) as err:
         if isinstance(err, TypeError):
             warnings.warn('The instantiation of MosesDetokenizer in sacremoses is'
                           ' currently only supported in python3.'
                           ' Now try NLTKMosesDetokenizer using NLTK ...')
         else:
             warnings.warn('sacremoses is not installed. '
                           'To install sacremoses, use pip install -U sacremoses'
                           ' Now try NLTKMosesDetokenizer using NLTK ...')
         try:
             import nltk
             try:
                 nltk.data.find('perluniprops')
             except LookupError:
                 nltk.download('perluniprops')
             from nltk.tokenize.moses import MosesDetokenizer
             self._detokenizer = MosesDetokenizer()
         except ImportError:
             raise ImportError('NLTK is not installed. '
                               'You must install NLTK <= 3.2.5 in order to use the '
                               'NLTKMosesDetokenizer. You can refer to the official '
                               'installation guide in https://www.nltk.org/install.html .')
Example #10
def rm_addresses(doc):
    doc_l = []
    rm_st = []
    sens = doc.split("\n")
    for sent in sens:
        sent_l = []
        for tuple2 in usaddress.parse(sent):
            # Keep tokens with benign labels; mask everything tagged as an address component
            if tuple2[1] in ('BuildingName', 'Recipient', 'OccupancyType',
                             'OccupancyIdentifier', 'LandmarkName'):
                sent_l.append(tuple2[0])
            else:
                sent_l.append("█" * len(tuple2[0]))
                rm_st.append(tuple2[0])
        # print(sent_l)
        deto = MosesDetokenizer()
        sent_n = deto.detokenize(sent_l, return_str=True)
        # sent_n = " ".join(sent_l)
        doc_l.append(sent_n)
    # print(doc_l)
    doc = "\n".join(doc_l)
    #print(doc)
    return doc, rm_st
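# Hypothetical usage of rm_addresses: tokens that usaddress tags as address
# components are masked character-for-character with '█'.
# doc = "Contact Bob Smith\n123 Main St Springfield IL 62701"
# redacted, removed = rm_addresses(doc)
# 'removed' would then contain the masked address tokens, e.g. '123', 'Main', 'St', ...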
Example #12
def spellCheck(s):
    tokens = nltk.word_tokenize(cleaner(s))
    corrected = [
        spell(tok) if tok not in string.punctuation else tok for tok in tokens
    ]
    mose = MosesDetokenizer()
    return mose.detokenize(corrected, return_str=True)
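# Hypothetical usage, assuming cleaner() strips noise and spell() is a
# word-level corrector (e.g. autocorrect's spell function):
# spellCheck("I havv a baad speling")  ->  roughly "I have a bad spelling"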
Example #13
    def __init__(
        self,
        separate_sentences=" ",  # moses can be suitable for prose if we don't insert newlines
        separate_words=" "):
        super(JoinerNLTK, self).__init__(separate_sentences=separate_sentences,
                                         separate_words=separate_words)

        self.detokenizer = MosesDetokenizer(lang="en")
Example #14
def generate_answers(session, model, word2id, qn_uuid_data, context_token_data, qn_token_data):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary mapping
    each unique ID to the generated answer.

    Inputs:
      session: TensorFlow session
      model: QAModel
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: lists

    Outputs:
      uuid2ans: dictionary mapping uuid (string) to predicted answer (string; detokenized)
    """
    uuid2ans = {} # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    num_batches = ((data_size-1) // model.FLAGS.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()

    print "Generating answers..."

    for batch in get_batch_generator(word2id, qn_uuid_data, context_token_data, qn_token_data, model.FLAGS.batch_size, model.FLAGS.context_len, model.FLAGS.question_len):

        # Get the predicted spans
        pred_start_batch, pred_end_batch = model.get_start_end_pos(session, batch)

        # Convert pred_start_batch and pred_end_batch to lists length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()

        # For each example in the batch:
        for ex_idx, (pred_start, pred_end) in enumerate(zip(pred_start_batch, pred_end_batch)):

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx] # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens
            pred_ans_tokens = context_tokens[pred_start : pred_end +1] # list of strings

            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens, return_str=True)

        batch_num += 1

        if batch_num % 10 == 0:
            print "Generated answers for %i/%i batches = %.2f%%" % (batch_num, num_batches, batch_num*100.0/num_batches)

    print "Finished generating answers for dataset."

    return uuid2ans
Example #15
def toDataFrame(tweets):
    # convert to dataframe
    DataSet = pd.DataFrame()

    # add parameters
    DataSet['tweetID'] = [tweet.id for tweet in tweets]
    DataSet['datetime'] = [tweet.created_at for tweet in tweets]
    DataSet['date'] = DataSet.datetime.dt.date
    DataSet['hour'] = DataSet.datetime.dt.hour
    DataSet['minute'] = DataSet.datetime.dt.minute
    DataSet['dayofweek'] = DataSet.datetime.dt.day_name()  # .weekday_name was removed in newer pandas
    DataSet['tweetRetweetCt'] = [tweet.retweet_count for tweet in tweets]
    DataSet['tweetFavoriteCt'] = [tweet.favorite_count for tweet in tweets]
    DataSet['tweetSource'] = [tweet.source for tweet in tweets]
    DataSet['userID'] = [tweet.user.id for tweet in tweets]
    DataSet['userScreen'] = [tweet.user.screen_name for tweet in tweets]
    DataSet['userName'] = [tweet.user.name for tweet in tweets]
    DataSet['userCreateDt'] = [tweet.user.created_at for tweet in tweets]
    DataSet['userDesc'] = [tweet.user.description for tweet in tweets]
    DataSet['userFollowerCt'] = [
        tweet.user.followers_count for tweet in tweets
    ]
    DataSet['userFriendsCt'] = [tweet.user.friends_count for tweet in tweets]
    DataSet['userLocation'] = [tweet.user.location for tweet in tweets]
    DataSet['userTimezone'] = [tweet.user.time_zone for tweet in tweets]
    DataSet['tweetText'] = [tweet.full_text for tweet in tweets]

    # tokenize tweetsText, and filter for stop words
    detokenizer = MosesDetokenizer()
    noStopWords = []
    for i in tweets:
        word_tokens = word_tokenize(i.full_text)
        filtered_sentence = [w for w in word_tokens if w not in stop_words]
        noStopWords.append(
            detokenizer.detokenize(filtered_sentence, return_str=True))
    DataSet['tweetNoSW'] = noStopWords

    # sentiment analysis
    analyzer = SentimentIntensityAnalyzer()
    DataSet['sentimentPos'] = [
        analyzer.polarity_scores(tweet)['pos']
        for tweet in DataSet['tweetNoSW']
    ]
    DataSet['sentimentNeut'] = [
        analyzer.polarity_scores(tweet)['neu']
        for tweet in DataSet['tweetNoSW']
    ]
    DataSet['sentimentNeg'] = [
        analyzer.polarity_scores(tweet)['neg']
        for tweet in DataSet['tweetNoSW']
    ]
    DataSet['sentimentComp'] = [
        analyzer.polarity_scores(tweet)['compound']
        for tweet in DataSet['tweetNoSW']
    ]
    return DataSet
Example #16
 def __init__(self):
     try:
         from nltk.tokenize.moses import MosesDetokenizer
     except ImportError:
         raise ImportError(
             'NLTK or relevant packages are not installed. You must install NLTK '
             'in order to use the NLTKMosesTokenizer. You can refer to the '
             'official installation guide in https://www.nltk.org/install.html .'
         )
     self._detokenizer = MosesDetokenizer()
Example #17
def generate_answers_from_dist(sess, model, total_dict, word2id, qn_uuid_data,
                               context_token_data, qn_token_data):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary mapping
    each unique ID to the generated answer.

    Inputs:
      sess: TensorFlow session
      model: QAModel
      total_dict: dict mapping uuid -> (start distribution, end distribution)
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: lists

    Outputs:
      uuid2ans: dictionary mapping uuid (string) to predicted answer (string; detokenized)
    """
    uuid2ans = {}  # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    num_batches = ((data_size - 1) // model.FLAGS.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()

    print "Generating answers..."

    for batch in get_batch_generator(word2id, qn_uuid_data, context_token_data,
                                     qn_token_data, model.FLAGS.batch_size,
                                     model.FLAGS.context_len,
                                     model.FLAGS.question_len):
        # For each example in the batch:
        for (ex_idx, uuid) in enumerate(batch.uuids):
            pred_start = np.argmax(total_dict[uuid][0])
            pred_end = np.argmax(total_dict[uuid][1])

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx]  # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens
            pred_ans_tokens = context_tokens[pred_start:pred_end +
                                             1]  # list of strings
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens,
                                                    return_str=True)

        batch_num += 1

        if batch_num % 10 == 0:
            print "Generated answers for %i/%i batches = %.2f%%" % (
                batch_num, num_batches, batch_num * 100.0 / num_batches)

    print "Finished generating answers for dataset."

    return uuid2ans
Example #18
    def __init__(self,
                 sess,
                 model_name,
                 dataset_name,
                 checkpoint,
                 char_emb=False,
                 fix_problems=False):
        """Prepare the model's dataset and trained model."""
        os.makedirs(os.path.join('training', 'data', 'dataset', dataset_name),
                    exist_ok=True)
        CURRENT_PATH = os.path.dirname(os.path.realpath(__file__))
        TRAINING_PATH = os.path.join(CURRENT_PATH, 'training')

        data_dir = os.path.join(TRAINING_PATH, 'data', 'dataset', dataset_name)
        model_dir = os.path.join(TRAINING_PATH, 'model', model_name)

        self.hparams = utils.load_hparams(
            os.path.join(model_dir, 'hparams.json'))

        self.detokenizer = MosesDetokenizer()

        self.char_emb = char_emb
        self.normalizer = predictor.Predictor(sess,
                                              dataset_dir=data_dir,
                                              output_dir=model_dir,
                                              output_file=checkpoint,
                                              hparams=self.hparams)
        self.fix_problems = fix_problems
        if self.fix_problems:
            ACCENT_PATH = os.path.join(TRAINING_PATH, 'data',
                                       'accented_words.dic')
            PANDIWA_PATH = os.path.join(TRAINING_PATH, 'data', 'pandiwa.dic')

            accent_words_dict = csv_to_dict(ACCENT_PATH)
            accent_words_dict = {
                v2: k
                for k, v in accent_words_dict.items() for v2 in v
            }

            pprint(accent_words_dict)
            with open(PANDIWA_PATH, 'r') as pandiwa_file:
                pandiwa_words_dict = pandiwa_file.read().splitlines()

            with open(os.path.join(TRAINING_PATH, 'data', 'hyph_fil.tex'),
                      'r') as f:
                hyphenator_dict = f.read()

            self.spell_corrector = SpellCorrector(dict_path=os.path.join(
                TRAINING_PATH, 'data', 'corpus', 'merged_bicol.txt'))

            self.t_normalizer = TextNormalizer(
                accent_words_dict=accent_words_dict,
                hyphenator_dict=hyphenator_dict,
                pandiwa_words_dict=pandiwa_words_dict,
                spell_corrector=self.spell_corrector)
Example #19
def detokenize(tokens, start, end):
    """
  Given a list of tokens, take the tokens from index start to index end and detokenize them
  """
    if end < start:
        return ''
    else:
        tokens = tokens[start:end + 1]
        detokenizer = MosesDetokenizer()
        return detokenizer.detokenize(
            [token.decode('utf-8') for token in tokens], return_str=True)
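# Hypothetical usage with byte tokens (the function decodes them as UTF-8):
print(detokenize([b"the", b"quick", b"brown", b"fox"], 1, 2))  # expected: "quick brown"
print(detokenize([b"the", b"quick", b"brown", b"fox"], 2, 1))  # expected: "" (end < start)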
def spacyMethod(nq):
    #NLTK/SPACY METHOD
    nq = nq.replace('-',' ').lower()
    detokenizer = MosesDetokenizer()
    ent_list = nltk.word_tokenize(nq)
    # Spacy stop-word removal (filter into a new list; deleting items while
    # enumerating the same list would skip tokens)
    ent_list = [ent for ent in ent_list if ent not in STOP_WORDS or ent == 'name']
    detokenizer.detokenize(ent_list, return_str=True)
    return(ent_list)
Example #21
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)

        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)
def normalize(post):
    reaction = get_most_common_reaction(post['reactions'])
    message = word_tokenize(post['message'])
    sw = stopwords.words('english')

    words_without_stopwords = [word for word in message if word not in sw]

    detokenizer = MosesDetokenizer()
    message_without_stopwords = detokenizer.detokenize(words_without_stopwords,
                                                       return_str=True)

    return {'message': message_without_stopwords, 'reaction': reaction}
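# Hypothetical usage; get_most_common_reaction is assumed to return the most
# frequent reaction type from post['reactions']:
# normalize({'message': 'This is the best day of the year', 'reactions': ...})
# -> {'message': 'This best day year', 'reaction': <most common reaction>}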
Example #23
def generate_answers(config, model, processor, qn_uuid_data,
                     context_token_data, qn_token_data):
    uuid2ans = {}  # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    num_batches = ((data_size - 1) // config.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()

    print "Generating answers..."

    for batch in get_batch_generator(processor.word2id, qn_uuid_data,
                                     context_token_data, qn_token_data,
                                     config.batch_size, config.context_len,
                                     config.question_len):

        # Get the predicted spans
        pred_start_batch, pred_end_batch = processor.test_one_batch(
            batch, model)

        # Convert pred_start_batch and pred_end_batch to lists length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()

        # For each example in the batch:
        for ex_idx, (pred_start, pred_end) in enumerate(
                zip(pred_start_batch, pred_end_batch)):

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx]  # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens
            pred_ans_tokens = context_tokens[pred_start:pred_end +
                                             1]  # list of strings

            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens,
                                                    return_str=True)

        batch_num += 1

        if batch_num % 10 == 0:
            print "Generated answers for %i/%i batches = %.2f%%" % (
                batch_num, num_batches, batch_num * 100.0 / num_batches)

    print "Finished generating answers for dataset."

    return uuid2ans
Example #24
def tokenize_text(text):
    # Tokenizers are basically an advanced split
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()

    processed_text = tokenizer.tokenize(text)

    # Need to detokenize to get all the weird symbols back as symbols
    processed_text = detokenizer.detokenize(processed_text)

    processed_text = preprocess(processed_text)

    return " ".join(processed_text)
Example #25
def extract(sent, ind):
    trial.append(sent)
    summary = []
    for j in range(len(ind)):
        summary.append(sent[ind[j]])
    # Detokenize the selected sentences once, after they have all been collected
    detokenizer = MosesDetokenizer()
    hello = detokenizer.detokenize(summary, return_str=True)
    main.append(hello)
    construct(hello)
def generate_answers(session, model, word2id, qn_uuid_data, context_token_data,
                     qn_token_data):
    """
    Given a model, and a set of (context, question) pairs, each with a unique ID,
    use the model to generate an answer for each pair, and return a dictionary mapping
    each unique ID to the generated answer.
    """
    uuid2ans = {}
    data_size = len(qn_uuid_data)
    num_batches = ((data_size - 1) // model.FLAGS.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()

    print("Generating answers...")

    for batch in get_batch_generator(word2id, qn_uuid_data, context_token_data,
                                     qn_token_data, model.FLAGS.batch_size,
                                     model.FLAGS.context_len,
                                     model.FLAGS.question_len):

        pred_start_batch, pred_end_batch = model.get_start_end_pos(
            session, batch)

        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()

        for ex_idx, (pred_start, pred_end) in enumerate(
                zip(pred_start_batch, pred_end_batch)):

            context_tokens = batch.context_tokens[ex_idx]

            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            pred_ans_tokens = context_tokens[pred_start:pred_end + 1]

            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens,
                                                    return_str=True)

        batch_num += 1

        if batch_num % 10 == 0:
            print("Generated answers for %i/%i batches = %.2f%%" %
                  (batch_num, num_batches, batch_num * 100.0 / num_batches))

    print("Finished generating answers for dataset.")

    return uuid2ans
Example #27
def preprocessing(data, row_name):
    """
    Reads reviews from the given dataframe column and preprocesses them.
    
    Parameters
    --------------------
        data  -- data frame 
        row_name -- name of row containing reviews
    
    Returns
    --------------------
        df -- dataframe with preprocessed reviews
    """

    #create tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    stop_words = set(stopwords.words('english'))

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # create list to store preprocessed text
    new_data = []

    for index, row in data.iterrows():
        # lower case the text
        lower_case = row[row_name].lower()

        #tokenize the text (removes punctuation)
        tokens = tokenizer.tokenize(lower_case)

        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in stop_words]

        #stemming
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

        #put it back into a string
        detokenizer = MosesDetokenizer()

        detokenized_text = detokenizer.detokenize(stemmed_tokens,
                                                  return_str=True)
        new_data.append(detokenized_text)

    #data frame of reviews
    df = pd.DataFrame({'review': new_data})
    return df
    def build_corpus(model_data):
        #import libraries
        import nltk
        from nltk.tokenize import TweetTokenizer
        from nltk.corpus import stopwords  #to use stopwords function and list
        from nltk.stem.porter import PorterStemmer
        from nltk.tokenize.moses import MosesDetokenizer

        #setting up tweets for train/test split
        #initialize corpus
        corpus = []

        #create instances of tweettokenizer, detokenizer, and porterstemmer.
        #initialize pattern to filter urls and usernames
        twtoken = TweetTokenizer()
        detokenizer = MosesDetokenizer()
        ps = PorterStemmer()
        url_pattern = re.compile(r'https?\S+')  # match both http and https links
        user_pattern = re.compile(r'@\S+')

        #build corpus
        for i in range(model_data.shape[0]):
            text = model_data['Text'][i]
            urls = re.findall(url_pattern, text)
            users = re.findall(user_pattern, text)
            users = [re.sub('[^@a-zA-Z]', '', user) for user in users]
            text = twtoken.tokenize(text)
            for url in urls:
                if url in text:
                    text.remove(url)
            for user in users:
                if user in text:
                    text.remove(user)
            text = detokenizer.detokenize(text, return_str=True)
            text = re.sub('[^a-zA-Z]', ' ', text)
            text = text.lower()
            text = text.split()
            try:
                text.remove('makeovermonday')
            except Exception:
                pass
            text = [
                ps.stem(word) for word in text
                if not word in set(stopwords.words('english'))
            ]
            text = ' '.join(text)
            corpus.append(text)

        return corpus
Example #29
def process_lines(cmd_args, src, ref, hyps, truecase_dict):
    """ Traverse all sentences from all sources. """

    re_bpe = re.compile("@@ ")
    if cmd_args.google:
        try:
            import mtranslate
        except ImportError:
            print(
                "Error: Install package 'mtranslate': pip install --user -U mtranslate"
            )
            exit(2)

    detok = None
    if cmd_args.detok:
        try:
            from nltk.tokenize.moses import MosesDetokenizer
            detok = MosesDetokenizer(lang=cmd_args.detok)
        except ImportError:  # NLTK isn't installed
            print("Error: Install package 'nltk': pip install --user -U nltk")
            exit(3)
        except LookupError:  # NLTK's data package perluniprops isn't installed
            print("Error: Install NLTK data package 'perluniprops': \
                   import nltk; nltk.download('perluniprops')")
            exit(4)

    #all_lines = {ref:[]}
    line_num = 0
    for src_line in src:
        line_num += 1
        if line_num > cmd_args.head:
            break

        src_line = process_line(cmd_args, src_line, re_bpe, detok,
                                truecase_dict)
        print("Src:     %s" % src_line)

        ref_line = ''
        if ref is not None:
            ref_line = ref.readline()
            ref_line = process_line(cmd_args, ref_line, re_bpe, detok,
                                    truecase_dict)
            #all_lines[ref.append(ref_line)]
            print("Ref:    ", ref_line)

        hyp_num = 1
        for hyp in hyps:
            hyp_line = hyp.readline()
            hyp_line = process_line(cmd_args, hyp_line, re_bpe, detok,
                                    truecase_dict)
            print_hyp(hyp_num, ref_line, hyp_line)
            hyp_num += 1

        if cmd_args.google:
            google_line = mtranslate.translate(src_line, cmd_args.google)
            # Don't truecase Google output
            google_line = process_line(cmd_args, google_line, re_bpe, detok,
                                       {})
            print_hyp(hyp_num, ref_line, google_line)

        print()
Example #30
def run():
    detokenizer = MosesDetokenizer()
    with open('data.json') as f:
        data = json.load(f)
    contexts = data['contexts']
    questions = data['questions']
    predictions = []
    for c, qs in tqdm(zip(contexts, questions), total=len(contexts)):
        if len(c) == 1:
            continue
        # Get vector embedding of sentence
        # Find the most similar sentence in the context
        for q in qs:
            predictions.append(
                detokenizer.detokenize(random.choice(c), return_str=True))
    return predictions
Example #31
    def preprocess(cls, to_dir='../../data/billion', max_line=None):
        # detokenizer = MosesDetokenizer()

        count = 0
        with MosesDetokenizer('en') as detokenize, \
                open(os.path.join(to_dir, 'train.src'), 'w+') as train_f, \
                open(os.path.join(to_dir, 'val.src'), 'w+') as val_f, \
                open(os.path.join(to_dir, 'test.src'), 'w+') as test_f:
            for root, dirs, files in os.walk(cls.path):
                for file in files:
                    with open(os.path.join(root, file), 'r') as in_f:
                        for line in in_f:
                            # Rebuild the original sentence (for the subword tokenizer)
                            # line = detokenize(line.rstrip().split(' '))
                            # line += '\n'

                            if count % 20 == 0:
                                test_f.write(line)
                            elif count % 10 == 0:
                                val_f.write(line)
                            else:
                                train_f.write(line)

                            if max_line is not None and count > max_line:
                                break

                            if count % 100000 == 0:
                                print(count)

                            count += 1
Example #32
def normalize_text(html):
    try:
        url_re = re.compile(r"https{0,1}://[^\s]+")
        url2_re = re.compile(r"[a-z0-9\.]+\.[a-z0-9\.]+/[^\s]*")
        space_re = re.compile(r"[\s]{2,}")

        html = html.encode("ascii", errors="ignore").decode()  # strip non-ASCII but keep a str for newspaper
        text = newspaper.fulltext(html)
        
        sent = text.encode('ascii', errors='ignore')
        sent = str(sent).replace("r\\", "")
        sent = str(sent).replace("n\\", "")
        sent = str(sent).replace("\\", "")
        text = sent

        t, d = MosesTokenizer(), MosesDetokenizer()
        tokens = t.tokenize(text)
        detokens = d.detokenize(tokens)
        text = " ".join(detokens)
        # Removing URLs
        text = url_re.sub(" ", text)
        text = url2_re.sub(" ", text)
            
        # Removing multiple spacing characters
        text = space_re.sub(" ", text)

        text = text.encode("ascii", errors="ignore").decode()
        text = preProcess(text)
        # Stripping leading and trailing spaces
        text = text.strip()
        return text
    except Exception as e:
        return ""
Example #33
class NLTKMosesTokenizer(Component):
    """Class for splitting texts on tokens using NLTK wrapper over MosesTokenizer

    Attributes:
        escape: whether to escape characters for use in HTML markup
        tokenizer: tokenizer instance from nltk.tokenize.moses
        detokenizer: detokenizer instance from nltk.tokenize.moses

    Args:
        escape: whether to escape characters for use in HTML markup
    """

    def __init__(self, escape: bool=False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
        """Tokenize given batch of strings or detokenize given batch of lists of tokens

        Args:
            batch: list of text samples or list of lists of tokens

        Returns:
            list of lists of tokens or list of text samples
        """
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]
        else:
            return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
                    for line in batch]
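# Hypothetical usage of the component above (requires the old nltk.tokenize.moses
# module, i.e. NLTK <= 3.2.5):
tokenizer = NLTKMosesTokenizer()
print(tokenizer(['Hello, world!']))               # expected: [['Hello', ',', 'world', '!']]
print(tokenizer([['Hello', ',', 'world', '!']]))  # expected: ['Hello, world!']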
Example #34
class NLTKMosesDetokenizer(object):
    r"""Apply the Moses Detokenizer implemented in NLTK.

    Users of this class are required to `install NLTK <https://www.nltk.org/install.html>`_
    and install relevant NLTK packages, such as
    :samp:`python -m nltk.downloader perluniprops nonbreaking_prefixes`

    Examples
    --------
    >>> detokenizer = gluonnlp.data.NLTKMosesDetokenizer()
    >>> detokenizer(['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of',
    ...              'text', 'processing', 'tools', '.'], return_str=True)
    'Gluon NLP toolkit provides a suite of text processing tools.'
    >>> detokenizer(['Das', 'Gluon','NLP-Toolkit','stellt','eine','Reihe','von',
    ...              'Textverarbeitungstools','zur','Verfügung','.'], return_str=True)
    'Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools zur Verfügung.'
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesDetokenizer
        except ImportError:
            warnings.warn('NLTK or relevant packages are not installed. '
                          'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                          'You must install NLTK <= 3.2.5 in order to use the '
                          'NLTKMosesDetokenizer. You can refer to the official '
                          'installation guide in https://www.nltk.org/install.html .'
                          ' Now try SacreMosesDetokenizer using sacremoses ...')
            try:
                from sacremoses import MosesDetokenizer
            except ImportError:
                raise ImportError('sacremoses is also not installed. '
                                  'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                                  'To install sacremoses, use pip install -U sacremoses')
        try:
            self._detokenizer = MosesDetokenizer()
        except ValueError:
            raise ValueError('The instantiation of MosesDetokenizer in sacremoses is'
                             ' currently only supported in python3.')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: list(str)
            The sentence to detokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of words

        Returns
        -------
        ret : list of strs or str
            List of words or detokenized text
        """
        return self._detokenizer.detokenize(sample, return_str=return_str)
Example #36
#!/usr/bin/python
# -*- coding: utf-8 -*-
import nltk

data = ["Hi", ",", "my", "name", "is", "Bob", "!"]
if nltk.__version__ == "3.2.2":
    from nltk.tokenize.moses import MosesDetokenizer # nltk 3.2.2
    detokenizer = MosesDetokenizer()
    sent = detokenizer.detokenize(data, return_str=True)

elif nltk.__version__ == "3.3":
    from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer # nltk 3.3
    detokenizer = TreebankWordDetokenizer()
    sent = detokenizer.detokenize(data)

else:
    exit("Unsupported NLTK version: %s" % nltk.__version__)

print(sent)
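# With either NLTK version, the detokenized result should be the same:
# Hi, my name is Bob!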
Example #37
class SacreMosesDetokenizer(object):
    r"""Apply the Moses Detokenizer implemented in sacremoses.

    Users of this class are required to `install sacremoses
    <https://github.com/alvations/sacremoses>`_. For example, one can use
    :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Examples
    --------
    >>> detokenizer = gluonnlp.data.SacreMosesDetokenizer()
    >>> detokenizer(['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of',
    ...              'text', 'processing', 'tools', '.'], return_str=True)
    'Gluon NLP toolkit provides a suite of text processing tools.'
    >>> detokenizer(['Das', 'Gluon','NLP-Toolkit','stellt','eine','Reihe','von',
    ...              'Textverarbeitungstools','zur','Verfügung','.'], return_str=True)
    'Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools zur Verfügung.'
    """
    def __init__(self):
        try:
            from sacremoses import MosesDetokenizer
            self._detokenizer = MosesDetokenizer()
        except (ImportError, TypeError) as err:
            if isinstance(err, TypeError):
                warnings.warn('The instantiation of MosesDetokenizer in sacremoses is'
                              ' currently only supported in python3.'
                              ' Now try NLTKMosesDetokenizer using NLTK ...')
            else:
                warnings.warn('sacremoses is not installed. '
                              'To install sacremoses, use pip install -U sacremoses'
                              ' Now try NLTKMosesDetokenizer using NLTK ...')
            try:
                import nltk
                try:
                    nltk.data.find('perluniprops')
                except LookupError:
                    nltk.download('perluniprops')
                from nltk.tokenize.moses import MosesDetokenizer
                self._detokenizer = MosesDetokenizer()
            except ImportError:
                raise ImportError('NLTK is not installed. '
                                  'You must install NLTK <= 3.2.5 in order to use the '
                                  'NLTKMosesDetokenizer. You can refer to the official '
                                  'installation guide in https://www.nltk.org/install.html .')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: list(str)
            The sentence to detokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of words

        Returns
        -------
        ret : list of strs or str
            List of words or detokenized text
        """
        return self._detokenizer.detokenize(sample, return_str=return_str)
Example #38
def detokenize(line):
    tokens = line.replace(" n't", "n't").split(' ')
    detokenizer = MosesDetokenizer()
    res = detokenizer.detokenize(tokens, return_str=True)
    res = res[0].upper() + res[1:]
    return res
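# Hypothetical usage of this simpler variant (no bracket mapping):
# detokenize("she did n't say anything .")  ->  "She didn't say anything."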