def processed_rude_words():
    if CommentStaticData.processed_rude_word_list is not None:
        return CommentStaticData.processed_rude_word_list

    CommentStaticData.processed_rude_word_list = [
        word
        for word in process_text(u' '.join(CommentStaticData.rude_word_list)).split(' ')
        if len(word.strip()) > 0
    ]
    CommentStaticData.processed_rude_word_list.extend([
        word
        for word in process_text(u' '.join(CommentStaticData.additional_stop_words)).split(' ')
        if len(word.strip()) > 0
    ])
    return CommentStaticData.processed_rude_word_list
Example #2
def store_qas(dataset, qas, vocab, max_length=20):
    total = len(qas)
    questions = dataset.create_dataset(
            'questions', (total, max_length), dtype='i')
    answers = dataset.create_dataset(
            'answers', (total, max_length), dtype='i')
    categories = dataset.create_dataset(
            'categories', (total,), dtype='i')
    image_indices = dataset.create_dataset(
            'image_indices', (total,), dtype='i')

    image_ids = []
    bar = progressbar.ProgressBar(maxval=len(qas))
    for idx, entry in enumerate(qas):
        i_image = len(image_ids)
        if entry['image_id'] in image_ids:
            i_image = image_ids.index(entry['image_id'])
        else:
            image_ids.append(entry['image_id'])
        image_indices[idx] = i_image
        categories[idx] = entry['category']
        q, length = process_text(entry['question'].encode('utf-8'), vocab,
                                 max_length=max_length)
        questions[idx, :length] = q
        a, length = process_text(entry['answer'].encode('utf-8'), vocab,
                                 max_length=max_length)
        answers[idx, :length] = a
        bar.update(idx)
    return image_ids
Example #3
    def calculate_similarity(self,
                             minhash=True,
                             simhash=True,
                             sequence_match=True,
                             shingle_settings=shingle_settings):
        """
        checking all common resources for changes
        image checking is broken for now, requires a separate handling

        :param minhash: True or False, default True
        :param simhash: True or False, default True
        :param sequence_match: True or False, default True
        :param shingle_settings: see `shingle_settings` in toggles.py

        :return:
            { resource_url_path:
                "hash_change" : True or False (sha1 change)
                "minhash": minhash_coefficient,
                "simhash": simhash_distance,
            }
        """
        compared = dict()
        for content_type in self.resources['modified'].keys():
            for url in self.resources['modified'][content_type]:
                compared[url] = {}

                p1 = utils.get_payload(url, self.warc1)
                p2 = utils.get_payload(url, self.warc2)

                dp1 = utils.decompress_payload(p1)
                dp2 = utils.decompress_payload(p2)

                cleaned_dp1 = utils.process_text(dp1)
                cleaned_dp2 = utils.process_text(dp2)

                # shingle cleaned text
                shingles1 = utils.shingle(cleaned_dp1,
                                          shingle_settings=shingle_settings)
                shingles2 = utils.shingle(cleaned_dp2,
                                          shingle_settings=shingle_settings)

                if minhash:
                    compared[url]['minhash'] = utils.get_minhash(
                        shingles1, shingles2)

                if simhash:
                    compared[url]['simhash'] = utils.get_simhash(
                        shingles1, shingles2)

                if sequence_match:
                    compared[url]['sequence_matched'] = utils.sequence_match(
                        cleaned_dp1, cleaned_dp2)

        return compared
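A minimal usage sketch for the method above; the WarcCompare class name, its constructor arguments, and the WARC file names are illustrative assumptions, not taken from the source project.

# Hypothetical wrapper class and file names, for illustration only.
comparison = WarcCompare(warc1='capture_old.warc.gz', warc2='capture_new.warc.gz')
scores = comparison.calculate_similarity(minhash=True,
                                         simhash=True,
                                         sequence_match=True)
for url, metrics in scores.items():
    print(url,
          metrics.get('minhash'),
          metrics.get('simhash'),
          metrics.get('sequence_matched'))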
Example #4

    def make_site_comment_params(parsed_args, verified, verified_user_id,
                                 is_rude):
        (comment_id, body, post_id, post_title, score, parent_post_id,
         creation_date, author_id, author_username, post_author_id,
         diff_with_post) = parsed_args
        question_id = -1
        answer_id = -1
        if parent_post_id > 0:
            question_id = parent_post_id
            answer_id = post_id
        else:
            question_id = post_id

        return {
            'comment_id': comment_id,
            'question_id': question_id,
            'answer_id': answer_id,
            'post_author_id': post_author_id,
            'post_score': score,
            'body': body,
            'title': post_title,
            'processed_body': process_text(body),
            'creation_date': creation_date,
            'author_id': author_id,
            'author_name': author_username,
            'verified': verified,
            'verified_user_id': verified_user_id,
            'is_rude': is_rude,
            'diff_with_post': diff_with_post
        }
Example #5
def run(args):
    doc = read_txt(args.path_to_doc)
    doc_tokens = [
        process_text(entry,
                     lower=not args.cased,
                     remove_stopwords=args.remove_stopwords,
                     remove_punctuation=args.remove_punctuation)
        for entry in doc
    ]

    all_tokens = []
    for entry_tokens in doc_tokens:
        all_tokens += entry_tokens

    rare_tokens, selected_tokens = get_rare_tokens(all_tokens,
                                                   args.min_freq,
                                                   args.max_tokens,
                                                   return_non_rare=True)
    if args.remove_rare:
        doc_tokens = [
            filter_tokens(entry_tokens, set(rare_tokens))
            for entry_tokens in doc_tokens
        ]

    gu = GloVeUtility(args.path_to_glove)

    vectorizer = CountVectorizer(ngram_range=(args.ngram_lower,
                                              args.ngram_upper),
                                 vocabulary=selected_tokens)
    count_vector = vectorizer.fit_transform(
        [" ".join(entry_tokens) for entry_tokens in doc_tokens])

    csr_mat = count_vector.T * count_vector
    csr_mat.setdiag(0)

    cooccur_ar = csr_mat.toarray()

    mittens_model = Mittens(n=gu.d, max_iter=args.iter)
    embeddings = mittens_model.fit(cooccur_ar,
                                   vocab=selected_tokens,
                                   initial_embedding_dict=gu.vector_dict)

    filename = args.path_to_glove.split(os.path.sep)[-1]
    os.makedirs(args.output, exist_ok=True)

    embeddings_dict = dict(zip(selected_tokens, embeddings))
    progress_bar.std_print("\nTrained on {} tokens.".format(
        len(embeddings_dict)))

    if args.save_new_only:
        savepath = os.path.join(args.output, "new_" + filename)
        embeddings_list = [
            " ".join([key] + [str(val) for val in embeddings_dict[key]])
            for key in embeddings_dict
        ]
        write_txt(savepath, embeddings_list)
    else:
        savepath = os.path.join(args.output, filename)
        gu.add_replace_vectors(embeddings_dict)
        gu.save_vectors(savepath)
Example #6
def get_data_to_buffer():
    buffer = list()
    text = process_text(os.path.join("data", "train.txt"))
    cnt = 0

    start = time.perf_counter()
    for i in range(len(text)):
        cnt += 1
        mel_gt_name = os.path.join(hparams.mel_ground_truth,
                                   "ljspeech-mel-%05d.npy" % (i + 1))
        mel_gt_target = np.load(mel_gt_name)
        duration = np.load(
            os.path.join(hparams.alignment_path,
                         str(i) + ".npy"))
        character = text[i][0:len(text[i]) - 1]
        character = np.array(text_to_sequence(character,
                                              hparams.text_cleaners))

        character = torch.from_numpy(character)
        duration = torch.from_numpy(duration)
        mel_gt_target = torch.from_numpy(mel_gt_target)

        buffer.append({
            "text": character,
            "duration": duration,
            "mel_target": mel_gt_target
        })

        if cnt % 1000 == 0:
            print("{:d} records have been processed.".format(cnt))

    end = time.perf_counter()
    print("cost {:.2f}s to load all data into buffer.".format(end - start))

    return buffer
Example #7
def main1():
    path = os.path.join("data", "LJSpeech-1.1")
    #preprocess_ljspeech(path)

    text_path = os.path.join("data", "train.txt")
    texts = process_text(text_path)

    if not os.path.exists(hp.alignment_path):
        os.mkdir(hp.alignment_path)

    tacotron2 = get_Tacotron2()
    num = 0
    for ind, text in enumerate(texts[num:]):

        if (ind > 10):
            exit(0)

        character = text[0:len(text) - 1]
        mel_gt_name = os.path.join(hp.mel_ground_truth,
                                   "ljspeech-mel-%05d.npy" % (ind + num + 1))
        mel_gt_target = np.load(mel_gt_name)

        _, _, D = load_data(character, mel_gt_target, tacotron2)

        np.save(os.path.join(hp.alignment_path,
                             str(ind + num) + ".npy"),
                D,
                allow_pickle=False)
Example #8
def parse_question_file(filename="questions.csv"):
    full_text = ""
    with open(filename, 'rt', encoding="utf8") as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        for row in csv_reader:
            _, _, _, title, body, tags = row
            full_text += " " + process_text(body)
    return full_text
Example #9
def parse_answer_file(filename="answers.csv"):
    full_text = ""
    with open(filename, 'rt', encoding="utf8") as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        for row in csv_reader:
            _, _, body, _ = row
            full_text += " " + process_text(body)
    return full_text
Example #10
def parse_comment_file(filename="comments.csv"):
    full_text = ""
    with open(filename, 'rt', encoding="utf8") as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        for row in csv_reader:
            body, post_id = row
            full_text += " " + process_text(body)
    return full_text
Example #11
def decrypt(text, key):
    '''
    Function -- decrypt
    Decrypts cipher text by replacing each letter in the text with the
    alphabet letter at that letter's index in the key.
    Parameters:
    text -- cipher text string
    key -- plain text string of length 26
    Returns the decrypted plain text version of the cipher text.
    '''
    # check validity of the inputs
    if key == '' or text == '':
        raise ValueError('both text and key values must be given')

    if not isinstance(key, str):
        raise TypeError('key must be a string')

    if not isinstance(text, str):
        raise TypeError('text must be a string')

    try:
        text = utils.strip(text)
        text = utils.process_text(text)
        text = utils.latin_caps(text)
        key = utils.strip(key)
        key = utils.process_text(key)
        key = utils.latin_caps(key)
    except ValueError:
        raise ValueError('text and key must only contain valid letters')

    if not utils.check_full(key):
        raise ValueError('key must contain only each letter of alphabet once')

    # alphabet and cipher
    alphabet_str = string.ascii_uppercase
    alphabet = list(alphabet_str)
    plain = ''
    # iterate through text and replace letter by key index
    for letter in text:
        if letter in alphabet:
            plain = plain + alphabet[key.index(letter)]
        else:
            raise ValueError('text and key must only contain valid letters')
    return plain
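A quick usage sketch, assuming the utils.strip, utils.process_text, and utils.latin_caps helpers pass an already-clean uppercase Latin string through unchanged (their exact behavior is project-specific):

# Encryption key: plaintext 'A' maps to 'Q', 'B' to 'W', and so on.
cipher_alphabet = 'QWERTYUIOPASDFGHJKLZXCVBNM'
print(decrypt('ITSSG', cipher_alphabet))  # expected output: 'HELLO'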
Example #12
def main(args):
    # Load the arguments.
    model_dir = os.path.dirname(args.model_path)
    params = Dict2Obj(
        json.load(open(os.path.join(model_dir, "args.json"), "r")))

    # Config logging
    log_format = '%(levelname)-8s %(message)s'
    logfile = os.path.join(model_dir, 'eval.log')
    logging.basicConfig(filename=logfile,
                        level=logging.INFO,
                        format=log_format)
    logging.getLogger().addHandler(logging.StreamHandler())
    logging.info(json.dumps(args.__dict__))
    # Load vocabulary wrapper.
    vocab = load_vocab(params.vocab_path)

    # Build data loader
    logging.info("Building data loader...")

    # Load GloVe embedding.
    if params.use_glove:
        embedding = get_glove_embedding(params.embedding_name, 300, vocab)
    else:
        embedding = None

    # Processing input text
    logging.info("Processing input text...")
    text, length = process_text(args.text, vocab, max_length=20)
    d_text = text

    logging.info("Done")
    # Build the models
    logging.info('Creating IQ model...')
    model = Classifier(len(vocab),
                       embedding_dim=params.embedding_dim,
                       embedding=embedding,
                       hidden_dim=params.num_hidden_nodes,
                       output_dim=params.num_output_nodes,
                       num_layers=params.num_layers,
                       bidirectional=params.bidirectional,
                       dropout=params.dropout,
                       rnn_cell=params.rnn_cell)

    logging.info("Done")

    logging.info("Loading model.")
    model.load_state_dict(
        torch.load(args.model_path + "model-tf-" + args.state + ".pkl"))

    # Setup GPUs.
    if torch.cuda.is_available():
        logging.info("Using available GPU...")
        model.cuda()

    predict(model, d_text)
Example #13

    def read_dumped_comments(self, filename='comments.csv'):
        data = list()

        def to_bool(field):
            return (str(field).lower() == 'true')

        def to_int(field, default_value=-1):
            try:
                value = int(field)
                return value
            except ValueError:
                return default_value

        def to_date(field, default_value=None):
            date_format = "%Y-%m-%d %H:%M:%S"
            try:
                value = datetime.strptime(field, date_format)
                return value
            except ValueError:
                return default_value

        with open(self.prefix + filename, 'rt', encoding="utf8") as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=',')
            for row in csv_reader:
                (comment_id, question_id, answer_id, post_author_id,
                 post_score, title, body, creation_date, author_id,
                 author_name, diff_with_post, verified, is_rude,
                 verified_user_id, added, analysed, looks_rude,
                 skipped) = row

                data_item = {
                    "comment_id": int(comment_id),
                    "question_id": int(question_id),
                    "answer_id": int(answer_id),
                    "post_author_id": int(post_author_id),
                    "post_score": int(post_score),
                    "title": title,
                    "body": body,
                    "processed_body": process_text(body),
                    "creation_date": to_date(creation_date),
                    "author_id": int(author_id),
                    "author_name": author_name,
                    "diff_with_post": int(diff_with_post),
                    "verified": to_date(verified),
                    "is_rude": to_bool(is_rude),
                    "verified_user_id": to_int(verified_user_id),
                    "added": to_date(added),
                    "analysed": to_date(analysed),
                    "looks_rude": to_bool(looks_rude),
                    "skipped": to_date(skipped)
                }
                data.append(data_item)

        return data
Example #14
def count(ngram, hash_size, doc_id):
    """Fetch the text of a document and compute hashed ngrams counts."""
    global DOC2IDX
    row, col, data = [], [], []

    # Get ngrams after tokenizing and processing (i.e. stopword/punctuation filtering)
    ngrams = utils.process_text(utils.normalize(fetch_text(doc_id)),
                                stopwords=True, stem=True, ngram=ngram)

    # Hash ngrams and count occurrences
    counts = Counter([utils.hash(gram, hash_size) for gram in ngrams])

    # Return in sparse matrix data format.
    row.extend(counts.keys())
    col.extend([DOC2IDX[doc_id]] * len(counts))
    data.extend(counts.values())
    return row, col, data
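The row/col/data triples above are intended to be assembled into a sparse count matrix; a sketch of that step, assuming scipy is available and that doc_ids holds the corpus document ids (both names here are assumptions):

import scipy.sparse as sp

hash_size = 2 ** 24
rows, cols, datas = [], [], []
for doc_id in doc_ids:  # doc_ids: assumed list of document identifiers
    r, c, d = count(2, hash_size, doc_id)
    rows.extend(r)
    cols.extend(c)
    datas.extend(d)

# hash buckets on the rows, documents on the columns
count_matrix = sp.csr_matrix((datas, (rows, cols)),
                             shape=(hash_size, len(doc_ids)))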
Example #15
def get_figure_mentions_by_lines(grobid_article_dir, figure_num, length,
                                 mentions_dict):
    """Get the text of figure mentions in the article using sentence windows.

    Args:
      grobid_article_dir: (string) the path to the processed Grobid file of the article.
      figure_num: (string) the figure number.
      length: (int) the number of lines that should surround a figure mention.
      mentions_dict: (dictionary) the locations of sentences in the texts where the mentions are.

    Returns:
      (string). The merged mention text.
    """
    mentions_text = ''
    doc_txt = ''
    with open(grobid_article_dir, 'r') as input_file:
        for line in input_file:
            line = line.rstrip('\n')
            doc_txt += line + ' '
        doc_txt = doc_txt.rstrip(' ')

    doc_txt = utils.process_text(doc_txt)
    sentences = sent_tokenize(doc_txt)

    if str(figure_num) in mentions_dict.keys():
        all_summary_ids = []
        for sentence_id in mentions_dict[str(figure_num)]:

            summary_ids = []
            for i in range(length, 0, -1):
                summary_ids += [sentence_id - i]
            summary_ids += [sentence_id]
            for i in range(1, length + 1):
                summary_ids += [sentence_id + i]
            all_summary_ids += [summary_ids]
        all_summary_ids = merge_texts(all_summary_ids)

        for summary_ids in all_summary_ids:
            for i in summary_ids:
                if i < 0 or i >= len(sentences):
                    continue
                mentions_text += ' ' + sentences[i]
            mentions_text += ' ... '
    mentions_text = mentions_text.rstrip(' ...') + '\"'
    return str(mentions_text)
Example #16
def generator(model):
    os.makedirs("gta", exist_ok=True)
    with torch.no_grad():
        text = process_text(os.path.join("data", "train.txt"))
        start = time.perf_counter()
        for i in tqdm(range(len(text))):
            mel_gt_name = os.path.join(hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (i + 1))
            mel_gt_target = np.load(mel_gt_name)
            character = text[i][0:len(text[i])-1]
            character = np.array(text_to_sequence(character, hparams.text_cleaners))
            character = torch.stack([torch.from_numpy(character)]).long().to(device)
            length = torch.Tensor([character.size(1)]).long().to(device)
            mel_gt_target = torch.stack([torch.from_numpy(mel_gt_target.T)]).float().to(device)
            mel_gta = model.gta(character, mel_gt_target, length)
            np.save(os.path.join("gta", "ljspeech-mel-%05d.npy" % (i + 1)), mel_gta.cpu()[0].numpy())

        end = time.perf_counter()
        print("cost {:.2f}s to generate gta data.".format(end - start))
Example #17
def main():
    path = os.path.join("data", "LJSpeech-1.1")
    preprocess_ljspeech(path)

    text_path = os.path.join(path, "metadata.csv")
    texts = process_text(text_path)

    if not os.path.exists(hp.alignment_path):
        os.mkdir(hp.alignment_path)

    num = 0
    for ind, line in enumerate(texts[num:]):
        parts = line.strip().split('|')
        phones = parts[4]
        # sumLen=parts[5];
        mel_gt_name = os.path.join(hp.mel_ground_truth,
                                   "ljspeech-mel-%05d.npy" % (ind + num + 1))
        mel_gt_target = np.load(mel_gt_name)
        D = np.array(phones.split(' ')).astype(int)
        if (ind % 100 == 0):
            print("calc number:", ind, D.sum(), parts[4],
                  mel_gt_target.shape[0], line)

        if (D.sum() > mel_gt_target.shape[0]):
            print("phonelen error:", D.sum(), mel_gt_target.shape[0], line)
            exit(0)

        if (abs(mel_gt_target.shape[0] - D.sum()) > 3):
            print("phonelen error:", D.sum(), mel_gt_target.shape[0], line)
            exit(0)

        if (D.sum() < mel_gt_target.shape[0]):
            gap = mel_gt_target.shape[0] - D.sum()
            front = int(gap / 2)
            end = gap - front
            D[0] = D[0] + front
            D[len(D) - 1] = D[len(D) - 1] + end

        np.save(os.path.join(hp.alignment_path,
                             str(ind + num) + ".npy"),
                D,
                allow_pickle=False)
Example #18

def word_overlap_phi(claim, evidence):
    """Basis for features for the words in both the claim and the evidence.
    This tends to produce very sparse representations.

    Parameters
    ----------
    claim : a string
    evidence : a list of sentences

    Returns
    -------
    defaultdict
       Maps each word in both claim and evidence to 1.

    """
    sents = []
    for sent in evidence:
        sents.extend(utils.process_sent(sent))
    overlap = set([w1 for w1 in utils.process_text(claim) if w1 in sents])
    return Counter(overlap)
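A small illustration of the overlap features, assuming utils.process_text and utils.process_sent behave like plain whitespace tokenizers (the real implementations may also lowercase or strip punctuation):

claim = "the cat sat"
evidence = ["a cat sat on the mat", "dogs bark"]
print(word_overlap_phi(claim, evidence))
# Counter({'the': 1, 'cat': 1, 'sat': 1}) under the tokenizer assumption above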
Example #19
    def fit(self, n_iter, num_proc=2):
        '''
        Trains the Word2vec model.
        :param n_iter: (int) number of training iterations
        :param num_proc: (int) number of parallel threads
        '''

        self.n_iter = n_iter
        self.num_proc = num_proc
        (self.X, self.y, self.word_to_index,
         self.index_to_word, self.occurence) = process_text(text=self.text,
                                                            vocab_size=self.vocab_size,
                                                            window_size=self.window_size)

        # Training
        self.M_in, self.M_out, self.loss, self.process_time = Hogwild(self.X, self.y, self.n_iter, self.vocab_size,
                                                                      self.embedding_size,
                                                                      self.learning_rate,
                                                                      self.window_size, self.n_negative,
                                                                      self.occurence, num_proc=self.num_proc)
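A call sketch for this trainer; the Word2Vec class name, the corpus_text variable, and the constructor arguments are illustrative assumptions about how the surrounding class is set up:

w2v = Word2Vec(text=corpus_text,  # corpus_text: assumed raw training text
               vocab_size=10000,
               window_size=5,
               embedding_size=100,
               learning_rate=0.025,
               n_negative=5)
w2v.fit(n_iter=5, num_proc=4)
input_vectors = w2v.M_in  # learned input embedding matrix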
Example #20
    def make_site_comment_params(comment, info):
        comment_id, post_id, body, creation_date, author_id, author_name = comment
        question_id, answer_id, post_author_id, post_author_name, score, title, post_creation_date = info

        return {
            "comment_id": comment_id,
            "question_id": question_id,
            "answer_id": answer_id,
            "post_author_id": post_author_id,
            "post_score": score,
            "title": title,
            "body": body,
            "processed_body": process_text(body),
            "creation_date": creation_date,
            "author_id": author_id,
            "author_name": author_name,
            "verified": None,
            "is_rude": False,
            "diff_with_post":
            (creation_date - post_creation_date).total_seconds()
        }
Example #21
def main():
    # path = os.path.join("data", "LJSpeech-1.1")
    # preprocess_ljspeech(path)

    text_path = os.path.join("data", "train.txt")
    texts = process_text(text_path)

    if not os.path.exists(hp.cemb_path):
        os.mkdir(hp.cemb_path)

    if not os.path.exists(hp.alignment_path):
        os.mkdir(hp.alignment_path)

    if not os.path.exists(hp.mel_tacotron2):
        os.mkdir(hp.mel_tacotron2)

    tacotron2 = get_Tacotron2()
    # wave_glow = get_WaveGlow()

    num = 0
    for ind, text in enumerate(texts[num:]):
        print(ind)
        # mel_name = os.path.join(hp.mel_ground_truth,
        #                         "ljspeech-mel-%05d.npy" % (ind+1))
        # mel_target = np.load(mel_name)
        character = text[0:len(text) - 1]
        mel_tacotron2, cemb, D = load_data_from_tacotron2(character, tacotron2)

        np.save(os.path.join(hp.mel_tacotron2,
                             str(ind + num) + ".npy"),
                mel_tacotron2,
                allow_pickle=False)
        np.save(os.path.join(hp.cemb_path,
                             str(ind + num) + ".npy"),
                cemb,
                allow_pickle=False)
        np.save(os.path.join(hp.alignment_path,
                             str(ind + num) + ".npy"),
                D,
                allow_pickle=False)
Example #22
def get_figure_ids_line_pos(grobid_article_dir):
    """Get the line numbers of lines in the text that have the figure mentions.

    Args:
      grobid_article_dir: (string) the path to the processed Grobid file of the article.
    Returns:
      (dictionary of lists). Mapping from figure identifiers to the lists of sentence indexes in the text where the mentions are.
    """
    figures_dict = {}
    doc_txt = ''
    with open(grobid_article_dir, 'r') as input_file:
        for line in input_file:
            line = line.rstrip('\n')
            doc_txt += line + ' '
        doc_txt = doc_txt.rstrip(' ')

    doc_txt = utils.process_text(doc_txt)
    sentences = sent_tokenize(doc_txt)

    for sentence_id, sentence in enumerate(sentences):
        if '<figcaptions>' in sentence:
            break

        words = sentence.split()
        for word_id, word in enumerate(words):
            if word.split('.')[0].lower().lstrip('(').rstrip(
                    ')') in figure_formats:
                word = word.replace('III', '3').replace('II',
                                                        '2').replace('I', '1')
                if len(word.split('.')) > 1:
                    word_length = len(word.split('.'))
                    fig_num = utils.extract_number('.'.join(
                        word.split('.')[1:word_length]))
                else:
                    fig_num = utils.extract_number(words[word_id + 1].replace(
                        'III', '3').replace('II', '2').replace('I', '1'))

                figures_dict[fig_num] = figures_dict.get(fig_num,
                                                         []) + [sentence_id]
    return figures_dict
Example #23
def word_cross_product_phi(claim, evidence):
    """Basis for cross-product features. This tends to produce pretty 
    dense representations.
    
    Parameters
    ----------
    claim : a string
    evidence : a list of sentences
        
    Returns
    -------
    defaultdict
        Maps each (w1, w2) in the cross-product of words in claim and 
        evidence to its count. This is a multi-set cross-product
        (repetitions matter).
    
    """
    sents = []
    for sent in evidence:
        sents.extend(utils.process_sent(sent))
    return Counter([(w1, w2)
                    for w1, w2 in product(utils.process_text(claim), sents)])
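For comparison with the overlap features above, a sketch of the cross-product features under the same whitespace-tokenizer assumption for utils.process_text and utils.process_sent:

claim = "cats purr"
evidence = ["cats purr loudly"]
print(word_cross_product_phi(claim, evidence))
# Counter({('cats', 'cats'): 1, ('cats', 'purr'): 1, ('cats', 'loudly'): 1,
#          ('purr', 'cats'): 1, ('purr', 'purr'): 1, ('purr', 'loudly'): 1})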
Example #24
def get_data_to_buffer():
    buffer = list()
    text = process_text(os.path.join("data", "train.txt"))

    start = time.perf_counter()
    for i in tqdm(range(len(text))):

        mel_gt_name = os.path.join(hparams.mel_ground_truth,
                                   "ljspeech-mel-%05d.npy" % (i + 1))
        mel_gt_target = np.load(mel_gt_name)
        character = text[i][0:len(text[i]) - 1]
        character = np.array(text_to_sequence(character,
                                              hparams.text_cleaners))

        character = torch.from_numpy(character)
        mel_gt_target = torch.from_numpy(mel_gt_target)

        buffer.append({"text": character, "mel_target": mel_gt_target})

    end = time.perf_counter()
    print("cost {:.2f}s to load all data into buffer.".format(end - start))

    return buffer
Example #25
    def clusteringtask():
        print('Classification endpoint hit')
        start = time()

        try:
            data = request.json['text']
            if isinstance(data, str):
                data = [data]

            data = process_text(data)
            pred = model.predict_batch(data)

            return jsonify({
                'message': 'Classification successful',
                'classification': [label_map(int(x[0][0])) for x in pred],
                'enum': [int(x[0][0]) for x in pred],
                'confidence': [x[0][1] for x in pred],
                'time': time() - start
            }), 200
        except Exception as e:
            tb = traceback.format_exc()
            print(f"TRACEBACK:\n\n{tb}\n")
            return jsonify({'message': str(e), 'stacktrace': str(tb)}), 500
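A client-side sketch for exercising this handler, assuming it is registered on a Flask app and served at a hypothetical /classify route on localhost (route path and port are assumptions):

import requests

resp = requests.post("http://localhost:5000/classify",  # route and port are assumptions
                     json={"text": "example document to classify"})
print(resp.status_code, resp.json().get("classification"))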
Example #26
def decrypt(text, key):
    '''
    Function -- decrypt
    decrypts cipher text by shifting letter indices to the left in the
    alphabet per the key (reversed if negative)
    paramters:
    text -- cipher text string
    key -- integer indicating magnitude and direction of index shift
    returns decrypted plain text version of the cipher text
    '''
    # check validity of inputs
    if key == '' or text == '':
        raise ValueError('both text and key values must be given')

    if not isinstance(key, int):
        raise TypeError('key must be an integer')

    if not isinstance(text, str):
        raise TypeError('text must be a string')

    try:
        text = utils.strip(text)
        text = utils.process_text(text)
    except ValueError:
        raise ValueError('text must only contain latin letters')

    key = key % 26

    # decrypt the string
    plain = ''
    a_index = list(range(0, 26))
    for x in text:
        if ord(x) >= 65 and ord(x) <= 90:
            plain += chr(a_index[((ord(x) - 65) - key) % 26] + 65)
        else:
            raise ValueError('text must only contain latin letters')
    return plain
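A quick check of the shift cipher, again assuming utils.strip and utils.process_text leave clean uppercase Latin text unchanged:

print(decrypt('KHOOR', 3))  # Caesar shift of 3; expected output: 'HELLO'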
Example #27

def word_cross_product_phi(claim, evidence):
    """Basis for cross-product features. This tends to produce pretty
    dense representations.

    Parameters
    ----------
    claim : a string
    evidence : a list of sentences

    Returns
    -------
    defaultdict
        Maps each (w1, w2) in the cross-product of words in claim and
        evidence to its count. This is a multi-set cross-product
        (repetitions matter).

    """
    local_sents = []
    for sent in evidence:
        local_sents.extend(utils.process_sent(sent))
        print("Sent: " + str(local_sents))
    return Counter([
        (w1, w2) for w1, w2 in product(utils.process_text(claim), local_sents)
    ])
Example #28
def word_list_maker(words):
    return [
        word for word in process_text(u' '.join(words)).split(u' ')
        if len(word.strip()) > 0
    ]
Example #29
parser.add_argument('--model',
                    default="Seq2Seq",
                    type=str,
                    help='choose a model: Seq2Seq')
args = parser.parse_args()

if __name__ == '__main__':
    dataset = 'Couplets'  # dataset name
    model_name = args.model  # Seq2Seq

    x = import_module('models.' +
                      model_name)  # dynamically import the module for the chosen model so its configuration is loaded at run time
    config = x.Config(dataset)  # the model's Config __init__ initializes its parameters
    start_time = time.time()
    print("Loading data...")
    input_texts, target_texts, input_characters, target_characters = process_text(
        config)

    num_encoder_tokens, num_decoder_tokens, max_encoder_seq_length, max_decoder_seq_length, input_token_index, target_token_index = bulid_token_index(
        input_texts, target_texts, input_characters, target_characters)

    encoder_input_data, decoder_input_data, decoder_target_data = build_dataset(
        input_texts, target_texts, num_encoder_tokens, num_decoder_tokens,
        max_encoder_seq_length, max_decoder_seq_length, input_token_index,
        target_token_index)

    config.num_encoder_tokens = num_encoder_tokens
    config.num_decoder_tokens = num_decoder_tokens
    config.max_encoder_seq_length = max_encoder_seq_length
    config.max_decoder_seq_length = max_decoder_seq_length
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
Example #30
    def __init__(self):
        self.text = process_text(os.path.join("data", "train.txt"))