Example #1
    def __init__(self, file, label_type, threshold):

        # Vocabulary mapping tokens to integer ids (built on the training split)
        with open('public_data/vocab/word2id.pkl', 'rb') as infile:
            self.word2id = pickle.load(infile)

        # Keep only labels whose training-set frequency reaches the threshold
        with open('public_data/stats/stats_train.pkl', 'rb') as infile:
            stats = pickle.load(infile)
        self.orig_labels = [label for label, freq in stats["DISTR_" + label_type].items() if freq >= threshold]
        self.label_type = label_type

        # Pre-built triplet dataframe with text columns TEXT_A, TEXT_B and TEXT_C
        with open("public_data/inputs/binary_%s_%s_%s_%s.pkl" % (file, label_type, str(threshold), "triplet"), 'rb') as indata:
            self.df = pd.read_pickle(indata, compression=None)[:TRAIN_LIMIT]

        # Encode each text as a padded sequence of word ids; unknown words fall back to index 1
        self.sequences_A = [torch.LongTensor([self.word2id.get(word, 1) for word in tokenize_and_clean(text)])
                            for text in self.df["TEXT_A"]]
        self.sequences_A = pad_sequence(self.sequences_A, batch_first=True)

        self.sequences_B = [torch.LongTensor([self.word2id.get(word, 1) for word in tokenize_and_clean(text)])
                            for text in self.df["TEXT_B"]]
        self.sequences_B = pad_sequence(self.sequences_B, batch_first=True)

        self.sequences_C = [torch.LongTensor([self.word2id.get(word, 1) for word in tokenize_and_clean(text)])
                            for text in self.df["TEXT_C"]]
        self.sequences_C = pad_sequence(self.sequences_C, batch_first=True)

        # One dummy target of 1.0 per triplet
        self.targets = torch.FloatTensor(np.ones(len(self.df)))
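For reference, a minimal, self-contained illustration (toy values) of what pad_sequence with batch_first=True does to the variable-length id sequences built above; the default padding value is 0:

import torch
from torch.nn.utils.rnn import pad_sequence

seqs = [torch.LongTensor([4, 9, 2]), torch.LongTensor([7, 3])]
padded = pad_sequence(seqs, batch_first=True)  # right-pads shorter sequences with 0
print(padded)        # tensor([[4, 9, 2],
                     #         [7, 3, 0]])
print(padded.shape)  # torch.Size([2, 3])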
def compute_tfidf(collection, queries, idf_base):
    """
    Computes tfidf vectors
    :param collection: all collection documents
    :param queries: all query documents
    :param idf_base: store idf with identifier "above_threshold", "below_threshold", "all"
    :return: document vectors for documents in collection and queries
    """

    vectorizer = TfidfVectorizer(analyzer='word',
                                 tokenizer=dummy_fun,
                                 preprocessor=dummy_fun,
                                 token_pattern=None)

    texts = list(collection["TEXT"].apply(lambda x: tokenize_and_clean(x)))
    collection = vectorizer.fit_transform(texts)
    collection = normalize(csr_matrix(collection, dtype=np.float32).toarray(),
                           copy=True)

    texts = list(queries["TEXT"].apply(lambda x: tokenize_and_clean(x)))
    queries = vectorizer.transform(texts)
    queries = normalize(csr_matrix(queries, dtype=np.float32).toarray(),
                        copy=True)

    max_idf = max(vectorizer.idf_)
    word2weight = defaultdict(lambda: max_idf,
                              [(w, vectorizer.idf_[i])
                               for w, i in vectorizer.vocabulary_.items()])
    with open("public_data/vocab/tf_idf_word2weight_%s.pkl" % idf_base,
              'wb') as out:
        pickle.dump(dict(word2weight), out, protocol=4)

    return collection, queries
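dummy_fun is not shown in this snippet; it is presumably an identity function so that the already-tokenized input passes through the vectorizer untouched. A minimal, self-contained sketch of that pattern with toy data (the function name identity is used here only for illustration):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

def identity(tokens):  # stands in for dummy_fun (assumed identity)
    return tokens

docs = [["neural", "retrieval", "model"], ["retrieval", "baseline"]]
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=identity,
                             preprocessor=identity, token_pattern=None)
tfidf = vectorizer.fit_transform(docs)                    # sparse (2 x vocab_size) matrix
vectors = normalize(tfidf.toarray().astype(np.float32))   # L2-normalized dense rows
print(vectors.shape)                                      # (2, 4)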
def make_txt_file(file):
    """
    Writes the tokenized documents from a pickled dataframe to a plain-text file,
    one whitespace-joined document per line, as input for unsupervised embedding training.
    :param file: path to the pickled dataframe
    """
    with open(file, 'rb') as infile:
        df = pd.read_pickle(infile, compression=None)
    texts = df["TEXT"].apply(lambda x: tokenize_and_clean(x))
    with open("public_data/inputs/data.txt", "w", encoding="utf-8") as out:
        for text in texts:
            out.write(" ".join(text) + "\n")
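A usage example, assuming the pickled training split lives at the path used by the dataset classes below:

make_txt_file("public_data/inputs/train.pkl")
# -> writes one whitespace-joined, tokenized document per line to public_data/inputs/data.txt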
Example #4
    def get_tokens_vocab(self):
        """
        Collects corpus statistics: the full token list, the vocabulary,
        token frequencies, and the average text length in tokens.
        """
        print(
            str(datetime.datetime.now()).split('.')[0],
            "Extracting tokens and vocab...")
        all_tokens = []
        seq_lens = []
        tokens = self.dataset["TEXT"].apply(lambda x: tokenize_and_clean(x))

        for token_list in tokens:
            all_tokens.extend(token_list)
            seq_lens.append(len(token_list))

        self.stats["TOKENS"] = all_tokens
        self.stats["VOCAB"] = list(set(all_tokens))
        self.stats["TOKEN_FREQS"] = Counter(all_tokens)
        self.stats["AVG_TEXT_LEN"] = sum(seq_lens) / len(seq_lens)
Example #5
    def __init__(self, file, label_type, threshold, which_labels):
        self.label_type = label_type

        # Vocabulary mapping tokens to integer ids (built on the training split)
        with open('public_data/vocab/word2id.pkl', 'rb') as infile:
            self.word2id = pickle.load(infile)

        # Load the requested split and cap its size
        with open('public_data/inputs/%s.pkl' % file, 'rb') as infile:
            data = pd.read_pickle(infile, compression=None)
        if file == "train":
            self.df = data[:TRAIN_LIMIT]
        elif file in ["valid", "test"]:
            self.df = data[:VALID_LIMIT]

        # Training-set label statistics used for threshold-based filtering
        with open('public_data/stats/stats_train.pkl', 'rb') as infile:
            stats = pickle.load(infile)

        # Optionally restrict the data to labels above or below the frequency threshold
        if which_labels in ["above_threshold", "below_threshold"]:

            if which_labels == "above_threshold":
                labels = [label for label, freq in
                          stats["DISTR_" + label_type].items() if freq >= threshold]

            elif which_labels == "below_threshold":
                labels = [label for label, freq in
                          stats["DISTR_" + label_type].items() if freq < threshold]

            self.df = self.df[self.df[label_type].isin(labels)]

        # Cap collection and query sizes after filtering
        if file == "train":
            self.df = self.df[:CORPUS_LIMIT]
        elif file in ["valid", "test"]:
            self.df = self.df[:QUERY_LIMIT]

        # Encode each text as a padded sequence of word ids; unknown words fall back to index 1
        self.sequences = [torch.LongTensor([self.word2id.get(word, 1) for word in tokenize_and_clean(text)])
                          for text in self.df["TEXT"]]
        self.sequences = pad_sequence(self.sequences, batch_first=True)
Example #6
    def __init__(self, file, label_type, threshold):

        # Vocabulary mapping tokens to integer ids (built on the training split)
        with open('public_data/vocab/word2id.pkl', 'rb') as infile:
            self.word2id = pickle.load(infile)

        # Keep labels above the frequency threshold, ordered by descending training
        # frequency, and map them to integer class ids
        with open('public_data/stats/stats_train.pkl', 'rb') as infile:
            stats = pickle.load(infile)
        d = stats["DISTR_" + label_type]
        labels = [label for label, freq in sorted(d.items(), key=lambda item: item[1], reverse=True)
                  if freq >= threshold]
        self.label2id = {l: i for i, l in enumerate(labels)}

        # Load the requested split, cap its size, and keep only rows with a retained label
        with open('public_data/inputs/%s.pkl' % file, 'rb') as indata:
            if file == "train":
                data = pd.read_pickle(indata, compression=None)[:TRAIN_LIMIT]
            elif file in ["valid", "test"]:
                data = pd.read_pickle(indata, compression=None)[:VALID_LIMIT]
        self.df = data[data[label_type].isin(labels)]

        # Encode each text as a padded sequence of word ids; unknown words fall back to index 1
        self.sequences = [torch.LongTensor([self.word2id.get(word, 1) for word in tokenize_and_clean(text)])
                          for text in self.df["TEXT"]]
        self.sequences = pad_sequence(self.sequences, batch_first=True)

        # Integer class id for each document
        self.labels = torch.LongTensor([self.label2id[label] for label in self.df[label_type]])
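A toy example (invented labels and counts) of the frequency-sorted label mapping built in this constructor:

distr = {"physics": 40, "math": 120, "biology": 15}
threshold = 20
labels = [label for label, freq in sorted(distr.items(), key=lambda item: item[1], reverse=True)
          if freq >= threshold]
label2id = {l: i for i, l in enumerate(labels)}
print(label2id)  # {'math': 0, 'physics': 1} -- 'biology' is dropped, ids follow descending frequency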
    def get_collection_and_queries(self):
        """
        Builds representations for the collection and query documents according to
        self.experiment: tf-idf vectors, tokenized texts for BM25, random baseline
        vectors, or document embeddings obtained from a trained model.
        :return: collection and query representations
        """
        if self.experiment == "TF-IDF":
            collection, queries = compute_tfidf(collection=self.collection.df,
                                                queries=self.queries.df,
                                                idf_base=self.which_labels)

        elif self.experiment == "BM25":
            queries = list(
                self.queries.df["TEXT"].apply(lambda x: tokenize_and_clean(x)))
            collection = list(self.collection.df["TEXT"].apply(
                lambda x: tokenize_and_clean(x)))

        elif self.experiment == "RANDOM_DOC":
            queries = normalize(np.random.uniform(
                low=-0.1, high=0.1,
                size=(len(self.queries), 300)).astype("float32"),
                                axis=1)
            collection = normalize(np.random.uniform(
                low=-0.1, high=0.1,
                size=(len(self.collection), 300)).astype("float32"),
                                   axis=1)

        else:

            collection_loader = DataLoader(self.collection,
                                           batch_size=64,
                                           shuffle=False)
            queries_loader = DataLoader(self.queries,
                                        batch_size=64,
                                        shuffle=False)

            # without additional classifier
            if "mucl" not in self.experiment and "pair" not in self.experiment and "triplet" not in self.experiment:

                queries = combine_avg_doc_embs(sequences=queries_loader,
                                               model=self.experiment,
                                               idf_base="all")
                collection = combine_avg_doc_embs(sequences=collection_loader,
                                                  model=self.experiment,
                                                  idf_base="all")

            else:  # with classifier

                if self.doc_repr == "avg":
                    queries = combine_avg_doc_embs(sequences=queries_loader,
                                                   model=self.model,
                                                   idf_base=self.which_labels)
                    collection = combine_avg_doc_embs(
                        sequences=collection_loader,
                        model=self.model,
                        idf_base=self.which_labels)

                elif self.doc_repr in ["hidden", "linear", "softmax"]:
                    collection = extract_from_model(
                        doc_repr=self.doc_repr,
                        sequences=collection_loader,
                        model=self.model)
                    queries = extract_from_model(doc_repr=self.doc_repr,
                                                 sequences=queries_loader,
                                                 model=self.model)

        return collection, queries
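Both the tf-idf and random-baseline representations are row-wise L2-normalized, so a plain dot product gives cosine similarity; a minimal sketch of how such vectors could be ranked per query (this ranking step is not part of the snippet, and rank_collection is a hypothetical helper):

import numpy as np

def rank_collection(query_vecs, collection_vecs, top_k=10):
    # cosine similarity reduces to a dot product for unit-length rows
    scores = query_vecs @ collection_vecs.T           # (n_queries, n_collection)
    return np.argsort(-scores, axis=1)[:, :top_k]     # indices of the top_k most similar documents

# e.g.: ranking = rank_collection(queries, collection)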