Example #1
    def create_fields(
        header: List[str],
        to_lower: bool = False,
        sen_column: str = "sen",
        tokenize_columns: Optional[List[str]] = None,
        convert_numerical: bool = False,
        tokenizer: Optional[PreTrainedTokenizer] = None,
    ) -> List[Tuple[str, Field]]:
        tokenize_columns = tokenize_columns or [sen_column]

        pipeline = None
        if convert_numerical:

            def preprocess_field(s: Union[str, int]) -> Union[str, int]:
                return int(s) if (isinstance(s, str) and s.isdigit()) else s

            pipeline = Pipeline(convert_token=preprocess_field)

        fields = []

        for column in header:
            if column in tokenize_columns:
                field = Field(batch_first=True, include_lengths=True, lower=to_lower)
                if tokenizer is not None:
                    attach_tokenizer(field, tokenizer)
            else:
                field = RawField(preprocessing=pipeline)
                field.is_target = False

            fields.append((column, field))

        return fields
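A brief usage sketch under the legacy torchtext API; the column names and the TabularDataset call below are illustrative assumptions, not part of the original snippet.

# Hypothetical column layout: only "sen" is tokenized; the other columns
# become RawFields whose digit-only strings are converted to ints.
header = ["sen", "labels", "idx"]
fields = create_fields(header, to_lower=True, convert_numerical=True)
# The resulting (name, Field) pairs can then be handed to a TabularDataset, e.g.:
# corpus = TabularDataset(path="corpus.tsv", format="tsv", fields=fields)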
Example #2
    def preprocess(self, x):
        """Load a single example using this field, tokenizing if necessary.

        If the input is a Python 2 `str`, it will be converted to Unicode
        first. If `sequential=True`, it will be tokenized. Then the input
        will be optionally lowercased and passed to the user-provided
        `preprocessing` Pipeline."""
        if (six.PY2 and isinstance(x, six.string_types) and
                not isinstance(x, six.text_type)):  # never
            x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
        if self.sequential and isinstance(x, six.text_type):  # never
            x = self.tokenize(x.rstrip('\n'))
        if self.lower:
            x = [Pipeline(six.text_type.lower)(xx) for xx in x]
        if self.preprocessing is not None:
            return self.preprocessing(x)
        else:
            return x
Example #3
    def preprocess(self, x):
        """Load a single example using this field, tokenizing if necessary.

        If the input is a Python 2 `str`, it will be converted to Unicode
        first. If `sequential=True`, it will be tokenized. Then the input
        will be optionally lowercased and passed to the user-provided
        `preprocessing` Pipeline."""
        if (six.PY2 and isinstance(x, six.string_types)
                and not isinstance(x, six.text_type)):
            x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
        if isinstance(x, six.text_type):
            x = self.tokenize(x.rstrip('\n'))
        if self.lower:
            x = Pipeline(six.text_type.lower)(x)
        # The Pipeline that will be applied to examples using this field after
        # tokenizing but before numericalizing. Many Datasets replace this
        # attribute with a custom preprocessor. Default: None.
        if self.preprocessing is not None:
            return self.preprocessing(x)
        else:
            return x
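As a quick illustration of what this preprocess step does to a raw line, here is a minimal sketch assuming the legacy torchtext Field with an explicit whitespace tokenizer (the import path differs across torchtext versions; the input string is made up).

from torchtext.data import Field   # torchtext.legacy.data in torchtext >= 0.9

field = Field(sequential=True, lower=True, tokenize=lambda s: s.split())
tokens = field.preprocess("The Quick Brown Fox\n")
# tokens == ['the', 'quick', 'brown', 'fox']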
Example #4
    def preprocess(self, x):
        """Load a single example using this field, tokenizing if necessary.

        If the input is a Python 2 `str`, it will be converted to Unicode
        first. If `sequential=True`, it will be tokenized. Then the input
        will be optionally lowercased and passed to the user-provided
        `preprocessing` Pipeline."""
        if (six.PY2 and isinstance(x, six.string_types)
                and not isinstance(x, six.text_type)):
            x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)

        if self.lower:
            x = Pipeline(six.text_type.lower)(x)

        if self.sequential and isinstance(x, six.text_type):
            doc = []
            sents = x.strip().split(' <eos> ')
            for sent in sents:
                doc.append(sent.strip().split())
            return doc
        else:
            raise RuntimeError('text_type')
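The ' <eos> ' splitting above can be shown in isolation; the input string below is made up for illustration.

x = "the cat sat <eos> the dog ran"
doc = [sent.strip().split() for sent in x.strip().split(' <eos> ')]
# doc == [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]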
Example #5
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label = data.Field(sequential=False,
                       postprocessing=Pipeline(convert_token=convert_token))
    id = data.Field(use_vocab=False, sequential=False)
    fields = [('id', id), ('turn1', utterance), ('turn2', utterance),
              ('turn3', utterance), ('label', label)]

    train = data.TabularDataset('{}/train.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)
    valid = data.TabularDataset('{}/valid.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)

    test = data.TabularDataset('{}/test.txt'.format(path),
                               format='tsv',
                               fields=fields,
                               skip_header=True)
    vectors = vocab.Vectors(name='emojiplusglove.txt',
                            cache='/media/backup/nlp-cic/DialogueRNN/')
    utterance.build_vocab(train, valid, test, vectors=vectors)
    #utterance.build_vocab(train, valid, test, vectors='glove.840B.300d')
    label.build_vocab(train)
    train_iter = BucketIterator(train,
                                train=True,
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    valid_iter = BucketIterator(valid,
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    test_iter = BucketIterator(test,
                               batch_size=batch_size,
                               sort_key=lambda x: len(x.turn3),
                               device=torch.device(0))
    return train_iter, valid_iter, test_iter,\
            utterance.vocab.vectors if not args.cuda else utterance.vocab.vectors.cuda(),\
            label.vocab.itos
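A hedged usage sketch for the loader above; it assumes the tokenizer, convert_token and args globals used inside get_E2E_loaders are defined, that the TSV files exist under the given path (the path here is illustrative), and that a CUDA device is available, since the iterators are pinned to torch.device(0).

train_iter, valid_iter, test_iter, embeddings, label_itos = \
    get_E2E_loaders("./data/emocontext", batch_size=32)
for batch in train_iter:
    turn3, label = batch.turn3, batch.label   # turn3: LongTensor of shape (seq_len, batch)
    break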
Example #6
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label = data.Field(sequential=False,
                       postprocessing=Pipeline(convert_token=convert_token))
    id = data.Field(use_vocab=False, sequential=False)
    fields = [('id', id), ('turn1', utterance), ('turn2', utterance),
              ('turn3', utterance), ('label', label)]

    train, valid = data.TabularDataset('{}/train.txt'.format(path),
                                       format='tsv',
                                       fields=fields,
                                       skip_header=True).split(1 - valid)

    test = data.TabularDataset('{}/devwithoutlabels.txt'.format(path),
                               format='tsv',
                               fields=fields[:-1],
                               skip_header=True)

    utterance.build_vocab(train, valid, test, vectors='glove.840B.300d')
    label.build_vocab(train)
    train_iter = BucketIterator(train,
                                train=True,
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    valid_iter = BucketIterator(valid,
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    test_iter = BucketIterator(test,
                               batch_size=batch_size,
                               sort_key=lambda x: len(x.turn3),
                               device=torch.device(0))
    return train_iter, valid_iter, test_iter,\
            utterance.vocab.vectors if not args.cuda else utterance.vocab.vectors.cuda(),\
            label.vocab.itos
Example #7
nltk.download("punkt")

# sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
TEMPLATE_DIR = os.path.abspath("./templates")
STATIC_DIR = os.path.abspath("./static")

app = Flask(__name__, template_folder=TEMPLATE_DIR, static_folder=STATIC_DIR)

# original stuff:
# app = Flask(__name__)
# , static_url_path= '', static_folder= './static/vendor'
# app._static_folder = './static/vendor'
# bootstrap = Bootstrap(app)
RESULT = None

pre_pipeline = Pipeline(lemmatize)
pre_pipeline.add_before(preprocessing)
TEXT = Field(
    sequential=True,
    tokenize=word_tokenize,
    lower=True,
    stop_words=STOPWORDS,
    preprocessing=pre_pipeline,
)
LABELS = ["Neutral", "Negative", "Positive"]
VOCAB = {}
with open("./models/vocab.pkl", "rb") as f:
    VOCAB = pickle.load(f)
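With torchtext's Pipeline, add_before means the added step runs first, so `preprocessing` above is applied before `lemmatize`. A toy sketch of that ordering, using stand-in callables rather than the app's real helpers:

from torchtext.data import Pipeline   # torchtext.legacy.data in torchtext >= 0.9

strip_punct = Pipeline(lambda tok: tok.strip(".,!?"))
fake_lemmatize = Pipeline(lambda tok: tok + "_LEMMA")
fake_lemmatize.add_before(strip_punct)   # strip_punct is applied first
print(fake_lemmatize("word!"))           # -> 'word_LEMMA'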

best_config = {
    "hidden_size": 302,
Example #8
def import_corpus(
    path: str,
    header: Optional[List[str]] = None,
    header_from_first_line: bool = False,
    to_lower: bool = False,
    vocab_path: Optional[str] = None,
    vocab_from_corpus: bool = False,
    sen_column: str = "sen",
) -> TabularDataset:

    """ Imports a corpus from a path.

    The corpus can either be a raw string or a pickled dictionary.
    Outputs a `Corpus` type, that is used throughout the library.

    The raw sentences are assumed to be stored under a `sen` or `sent` tag.
    Sentences can optionally be labeled; these labels are assumed to be
    stored under a `labels` tag.

    Parameters
    ----------
    path : str
        Path to corpus file
    header : List[str], optional
        Optional list of attribute names for each column. If not provided,
        all lines are treated as sentences, with the attribute name "sen".
    to_lower : bool, optional
        Transform entire corpus to lower case, defaults to False.
    header_from_first_line : bool, optional
        Use the first line of the corpus as the attribute names of the
        corpus.
    vocab_path : str, optional
        Path to the model vocabulary, which should be a file containing
        one vocab entry per line.
    vocab_from_corpus : bool, optional
        Create a new vocabulary from the tokens of the corpus itself.
        If set to True `vocab_path` does not need to be provided.
        Defaults to False.
    sen_column : str, optional
        Name of the corpus column containing the raw sentences.
        Defaults to `sen`.

    Returns
    -------
    corpus : TabularDataset
        A TabularDataset containing the parsed sentences and optional labels
    """

    if header is None:
        if header_from_first_line:
            with open(path) as f:
                header = f.readline().strip().split("\t")
        else:
            header = ["sen"]

    assert sen_column in header, f"`{sen_column}` should be part of the corpus header!"

    def preprocess(s: str) -> Union[str, int]:
        return int(s) if s.isdigit() else s

    pipeline = Pipeline(convert_token=preprocess)
    fields = {}
    for field in header:
        if field == sen_column:
            fields[field] = Field(
                batch_first=True, include_lengths=True, lower=to_lower
            )
        elif field == "labels":
            fields[field] = Field(
                use_vocab=False, tokenize=lambda s: list(map(int, s.split()))
            )
        else:
            fields[field] = RawField(preprocessing=pipeline)
            fields[field].is_target = False

    corpus = TabularDataset(
        fields=fields.items(),
        format="tsv",
        path=path,
        skip_header=header_from_first_line,
        csv_reader_params={"quotechar": None},
    )

    # The current torchtext Vocab does not allow a fixed vocab order
    if vocab_path is not None or vocab_from_corpus:
        attach_vocab(corpus, vocab_path or path, sen_column=sen_column)

    return corpus
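A hedged usage sketch of import_corpus; the file name and header layout are assumptions for illustration only.

# Assumed input: a TSV whose first line is the header "sen\tlabels".
corpus = import_corpus(
    "corpus.tsv",
    header_from_first_line=True,
    to_lower=True,
    vocab_from_corpus=True,
)
print(list(corpus.fields))           # e.g. ['sen', 'labels']
print(vars(corpus.examples[0]))      # first parsed sentence and its labels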
Example #9
    #Define all the variables to be read by torchtext TabularDataset
    TEXT1 = Field(sequential=True,
                  tokenize=tokenize_protein,
                  init_token=None,
                  eos_token=None,
                  pad_first=False)
    TEXT2 = Field(sequential=True,
                  tokenize=tokenize_drug,
                  init_token=None,
                  eos_token=None)
    LABEL = Field(sequential=False,
                  use_vocab=False,
                  is_target=True,
                  dtype=torch.float,
                  preprocessing=Pipeline(lambda x: float(x)))
    INDEX1 = Field(sequential=False, use_vocab=True)
    INDEX2 = Field(sequential=False, use_vocab=True)

    #Read the data and get Protein Sequence, Canonical Smiles and Pchembl_value
    datafields = [('uniprot_accession', INDEX1), ("Sequence", TEXT1),
                  ('standard_inchi_key', INDEX2), ("canonical_smiles", TEXT2),
                  ("pchembl_value", LABEL)]

    # Predict activity scores for SARS-CoV-2 viral proteins.
    # The full dataset is used only to obtain the InChIKeys of all compounds in
    # the train and test sets and the UniProt accessions of all viral organisms.
    full_data, data, test_data = TabularDataset.splits(
        path="../data/",
        train='all_compound_viral_interactions_for_supervised_learning.csv',
        validation=args.input1,
        #test='Test_Compound_Viral_interactions_for_Supervised_Learning.csv',
Example #10
class TRECQA(CastorPairDataset):
    NAME = 'trecqa'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False,
                     tensor_type=torch.FloatTensor,
                     use_vocab=False,
                     batch_first=True)
    AID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(
        batch_first=True, tokenize=lambda x: x
    )  # tokenizer is identity since we already tokenized it to compute external features
    EXT_FEATS_FIELD = Field(
        tensor_type=torch.FloatTensor,
        use_vocab=False,
        batch_first=True,
        tokenize=lambda x: x,
        postprocessing=Pipeline(lambda arr, _, train: [float(y) for y in arr]))
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    VOCAB_SIZE = 0

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a TRECQA dataset instance
        """
        super(TRECQA, self).__init__(path, load_ext_feats=True)

    @classmethod
    def splits(cls,
               path,
               train='train-all',
               validation='raw-dev',
               test='raw-test',
               **kwargs):
        return super(TRECQA, cls).splits(path,
                                         train=train,
                                         validation=validation,
                                         test=test,
                                         **kwargs)

    @classmethod
    def set_vectors(cls, field, vector_path):
        if os.path.isfile(vector_path):
            stoi, vectors, dim = torch.load(vector_path)
            field.vocab.vectors = torch.Tensor(len(field.vocab), dim)

            for i, token in enumerate(field.vocab.itos):
                wv_index = stoi.get(token, None)
                if wv_index is not None:
                    field.vocab.vectors[i] = vectors[wv_index]
                else:
                    # initialize <unk> with uniform_(-0.05, 0.05) vectors
                    field.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(
                        -0.05, 0.05)
        else:
            print("Error: Need word embedding pt file")
            exit(1)
        return field

    @classmethod
    def iters(cls,
              path,
              vectors_name,
              vectors_dir,
              batch_size=64,
              shuffle=True,
              device=0,
              pt_file=False,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_dir: directory containing word vectors file
        :param batch_size: batch size
        :param shuffle: whether to shuffle examples between epochs
        :param device: GPU device
        :param pt_file: if True, load pre-serialized vectors via set_vectors instead of a Vectors cache
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """

        train, validation, test = cls.splits(path)
        if not pt_file:
            if vectors is None:
                vectors = Vectors(name=vectors_name,
                                  cache=vectors_dir,
                                  unk_init=unk_init)
            cls.TEXT_FIELD.build_vocab(train,
                                       validation,
                                       test,
                                       vectors=vectors)
        else:
            cls.TEXT_FIELD.build_vocab(train, validation, test)
            cls.TEXT_FIELD = cls.set_vectors(
                cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))

        cls.LABEL_FIELD.build_vocab(train, validation, test)

        cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)

        return BucketIterator.splits((train, validation, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     device=device)
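A hedged usage sketch of TRECQA.iters; the directory layout and vector file name are assumptions, and the defaults from the snippet (which place batches on GPU device 0) are left untouched.

# Assumes ./data/trecqa holds the train-all / raw-dev / raw-test splits and
# ./embeddings contains the word-vector text file named below.
train_iter, dev_iter, test_iter = TRECQA.iters(
    path="./data/trecqa",
    vectors_name="glove.840B.300d.txt",
    vectors_dir="./embeddings",
    batch_size=64,
)
print(TRECQA.VOCAB_SIZE)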
Example #11
def parse_input_files(batch_size,
                      embedding_dim,
                      using_GPU,
                      filepath="./data/new_annot/trainsplit_holdtarg",
                      train_name="train.json",
                      dev_name="dev.json",
                      test_name="test.json",
                      has_holdtarg=False,
                      dev_batch_size=100):
    """
    Reads the file with name filename
    """
    if using_GPU:
        torch.cuda.device(0)
        print("Running on device " + str(torch.cuda.current_device()))

    print("creating fields")
    TEXT = data.Field(sequential=True,
                      use_vocab=True,
                      batch_first=True,
                      tokenize=dummy_tokenizer,
                      include_lengths=True)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
    POLARITY = data.Field(sequential=True,
                          use_vocab=True,
                          batch_first=True,
                          tokenize=dummy_tokenizer)
    DOCID = data.Field(sequential=False,
                       use_vocab=True,
                       batch_first=True,
                       tokenize=dummy_tokenizer)
    # may not need these two?
    # HOLDER = data.Field(sequential=True, use_vocab=True, batch_first=True, tokenize=tokenizer)
    # TARGET = data.Field(sequential=True, use_vocab=True, batch_first=True, tokenize=tokenizer)

    if has_holdtarg:
        HOLDER_TARGET = data.Field(sequential=True,
                                   use_vocab=True,
                                   batch_first=True,
                                   tokenize=dummy_tokenizer)
    H_IND = data.Field(sequential=True,
                       use_vocab=False,
                       batch_first=True,
                       postprocessing=Pipeline(custom_post_inds),
                       include_lengths=True)
    T_IND = data.Field(sequential=True,
                       use_vocab=False,
                       batch_first=True,
                       postprocessing=Pipeline(custom_post_inds),
                       include_lengths=True)

    # features
    CO_OCCURRENCES = data.Field(sequential=False,
                                use_vocab=False,
                                batch_first=True)
    HOLDER_RANK = data.Field(sequential=False,
                             use_vocab=False,
                             batch_first=True)
    TARGET_RANK = data.Field(sequential=False,
                             use_vocab=False,
                             batch_first=True)
    SENT_CLASSIFY = data.Field(sequential=False,
                               use_vocab=False,
                               batch_first=True)

    data_fields = {
        'token': ('text', TEXT),
        'label': ('label', LABEL),
        #'holder': ('holder', HOLDER), 'target': ('target', TARGET),
        'polarity': ('polarity', POLARITY),
        'docid': ('docid', DOCID),
        'holder_index': ('holder_index', H_IND),
        'target_index': ('target_index', T_IND),
        'co_occurrences': ('co_occurrences', CO_OCCURRENCES),
        'holder_rank': ('holder_rank', HOLDER_RANK),
        'target_rank': ('target_rank', TARGET_RANK),
        'classify': ('sent_classify', SENT_CLASSIFY)
    }
    if has_holdtarg:
        data_fields['holder_target'] = ('holder_target', HOLDER_TARGET)

    print("parsing data from file")

    train, val, test = data.TabularDataset.splits(path=filepath,
                                                  train=train_name,
                                                  validation=dev_name,
                                                  test=test_name,
                                                  format='json',
                                                  fields=data_fields)

    print("loading word embeddings")
    TEXT.build_vocab(train, vectors="glove.6B." + str(embedding_dim) + "d")
    POLARITY.build_vocab(train)
    print(POLARITY.vocab.stoi)
    if has_holdtarg:
        HOLDER_TARGET.build_vocab(train)
        print(HOLDER_TARGET.vocab.stoi)
    DOCID.build_vocab(train, val, test)

    print("Train length = " + str(len(train.examples)))
    print("Dev length = " + str(len(val.examples)))
    print("Test length = " + str(len(test.examples)))
    #print(val.examples[0].text)

    validation_batch = min(len(val.examples), 100)
    if dev_batch_size is not None:
        validation_batch = dev_batch_size
    test_batch = min(len(test.examples), 100)

    print("splitting & batching data")
    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.text),
        repeat=False,
        batch_sizes=(batch_size, validation_batch, test_batch),
        sort_within_batch=True)

    print("Repeat = " + str(train_iter.repeat))

    return train_iter, val_iter, test_iter, TEXT, DOCID, POLARITY
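Finally, a hedged usage sketch for parse_input_files; it assumes the JSON splits exist under the default filepath and that dummy_tokenizer and custom_post_inds are defined as elsewhere in the original module.

train_iter, val_iter, test_iter, TEXT, DOCID, POLARITY = parse_input_files(
    batch_size=32,
    embedding_dim=300,
    using_GPU=torch.cuda.is_available(),
)
for batch in train_iter:
    (tokens, lengths), labels = batch.text, batch.label   # include_lengths=True
    break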