Example #1
def _get_nlp(language="en", constituencies=False):
    """
    Get spaCy/benepar with models by language
    """
    import spacy

    language = language.lower()
    model_name = LANGUAGE_TO_MODEL.get(language, language)

    try:
        nlp = spacy.load(model_name)
    except OSError:
        from spacy.cli import download

        download(model_name)
        nlp = spacy.load(model_name)

    if language in BENEPAR_LANGUAGES and constituencies:
        from benepar.spacy_plugin import BeneparComponent

        try:
            nlp.add_pipe(BeneparComponent(BENEPAR_LANGUAGES[language]))
        except LookupError:
            import benepar

            benepar.download(BENEPAR_LANGUAGES[language])
            nlp.add_pipe(BeneparComponent(BENEPAR_LANGUAGES[language]))
            # nlp.add_pipe(nlp.create_pipe("sentencizer"))
    return nlp
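
A minimal usage sketch for the helper above. It relies on two module-level lookup tables, LANGUAGE_TO_MODEL and BENEPAR_LANGUAGES, that are not included in the snippet; the table contents and model names below are assumptions, not the original values.

# Hypothetical lookup tables assumed by _get_nlp (not from the original source).
LANGUAGE_TO_MODEL = {
    "en": "en_core_web_sm",  # spaCy model per language code
    "de": "de_core_news_sm",
}
BENEPAR_LANGUAGES = {
    "en": "benepar_en2",     # benepar model per language code
}

# Load English with the constituency parser attached and parse one sentence.
nlp = _get_nlp("en", constituencies=True)
doc = nlp("The quick brown fox jumps over the lazy dog.")
print(list(doc.sents)[0]._.parse_string)
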
def parse_tree_features(df):
    """
    Get features which can be extracted from the parse tree of a text. 
    
    Adds features:
    NP_per_sent: NPs (noun phrase) / num of sentences
    VP_per_sent: VPs (verb phrase) / num of sentences
    PP_per_sent: PPs (prepositional phrase) / num of sentences
    SBAR_per_sent: SBARs (subordinate clause) / num of sentences
    SBARQ_per_sent: SBARQs (direct question introduced by wh-element) / num of sentences
    avg_NP_size: Average length of an NP
    avg_VP_size: Average length of a VP
    avg_PP_size: Average length of a PP
    avg_parse_tree: Average height of a parse tree
    
    :param df: the dataframe with the dataset
    :returns: the dataframe with the added features
    """
    
    nlp = spacy.load(SPACY_MODEL, disable=['ner'])
    nlp.add_pipe(BeneparComponent("benepar_en_small"))
    
    # parse text
    df['B_Tokens'] = df['Text'].apply(lambda x: nlp(x))
    
    # get features
    df['NP_per_sent'], df['VP_per_sent'], df['PP_per_sent'], \
    df['SBAR_per_sent'], df['SBARQ_per_sent'], df['avg_NP_size'], \
    df['avg_VP_size'], df['avg_PP_size'], df['avg_parse_tree'] = zip(*df['B_Tokens'].map(_get_parse_tree_features))
    
    # remove B_Tokens
    df.drop(columns=["B_Tokens"], inplace=True)
    
    return df
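
parse_tree_features delegates to a helper, _get_parse_tree_features, that is not shown in this snippet. Below is a hedged reconstruction of how such a helper might derive the nine features from a benepar-annotated Doc via the _.constituents, _.labels and _.children span extensions; it is a sketch under those assumptions, not the original implementation.

from statistics import mean

def _get_parse_tree_features(doc):
    # Hypothetical sketch: count constituent types, their token lengths,
    # and parse-tree heights for every sentence in the Doc.
    counts = {"NP": 0, "VP": 0, "PP": 0, "SBAR": 0, "SBARQ": 0}
    sizes = {"NP": [], "VP": [], "PP": []}
    heights = []

    def height(span):
        children = list(span._.children)
        return 1 if not children else 1 + max(height(c) for c in children)

    n_sents = 0
    for sent in doc.sents:
        n_sents += 1
        heights.append(height(sent))
        for const in sent._.constituents:
            for label in const._.labels:
                if label in counts:
                    counts[label] += 1
                if label in sizes:
                    sizes[label].append(len(const))

    n_sents = max(n_sents, 1)
    return (
        counts["NP"] / n_sents, counts["VP"] / n_sents, counts["PP"] / n_sents,
        counts["SBAR"] / n_sents, counts["SBARQ"] / n_sents,
        mean(sizes["NP"]) if sizes["NP"] else 0.0,
        mean(sizes["VP"]) if sizes["VP"] else 0.0,
        mean(sizes["PP"]) if sizes["PP"] else 0.0,
        mean(heights) if heights else 0.0,
    )
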
    def __init__(self,
                 lang={
                     'spacy': 'en',
                     'benepar': 'benepar_en2'
                 },
                 config=None):
        super().__init__()
        self.download = False
        # Checking if NLTK sentence and word tokenizers should be downloaded
        if not config_berkeley_nlp['benepar_sent_word_tok_downloaded']:
            spacy.load(lang['spacy'])
            config_global['config_benepar'][
                'benepar_sent_word_tok_downloaded'] = True
            self.download = True
        # Checking if parsing model should be downloaded
        if not config_berkeley_nlp['parsing_model_downloaded']:
            benepar.download(lang['benepar'])
            config_global['config_benepar']['parsing_model_downloaded'] = True
            self.download = True
        # Updating yaml file if necessary
        if self.download:
            with open("./config.yaml", "w") as f:
                yaml.dump(config_global, f)

        self.nlp = spacy.load(lang['spacy'])
        self.nlp.add_pipe(BeneparComponent(lang['benepar']))
        self.sd = StanfordDependencies.get_instance(
            backend='subprocess')  # to convert trees
        self.name_save = 'benepar'
Example #4
def preprocess(data_dir, output_fn, batch, threads):
    print('preparing text')
    data = all_data(data_dir)
    print('loading parser')
    nlp = spacy.load('en_core_web_lg')
    """
    try:
        nlp = spacy.load('en_core_web_lg')
    except IOError:
        spacy.cli.download('en_core_web_lg')
        nlp = spacy.load('en_core_web_lg')
    """
    nlp.add_pipe(BeneparComponent('benepar_en'))
    with open(output_fn, 'w') as f:
        i = 0
        for doc in nlp.pipe(data, batch_size=batch, n_threads=threads):
            # benepar has no pipe() method, so manually invoke the pipe.
            doc = nlp.get_pipe('benepar')(doc)
            for sent in doc.sents:
                if len(sent.text) < MIN_SENTENCE_CHAR_LENGTH:
                    continue
                if i % 25 == 0:
                    print('%d..' % i, end='')
                i += 1
                f.write(u'< ' + sent.text + '\n')
                f.write(u'> ' + transform_present_span(sent) + '\n')
    print('complete')
def main():
    nlp = spacy.load('en')
    nlp.add_pipe(BeneparComponent('benepar_en2_large'))

    all_negation_examples = read_json(DATA_PATH + 'json/' + DATASET_NAME +
                                      '.json')
    write_path = DATA_PATH + 'conll/gold_cue/' + TASK + '_' + DATASET_NAME + '/'
    if not os.path.isdir(write_path):
        os.makedirs(write_path)

    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    split_id = 0
    for _, valid_id in cv.split(all_negation_examples, all_negation_examples):
        valid_examples = [all_negation_examples[i] for i in valid_id]
        with open(write_path + 'train_cv' + str(split_id) + '.conll',
                  'w') as writer:
            for example in tqdm(valid_examples):
                token_list, sentence = get_sentences_and_tokens_from_spacy(
                    example[0], nlp, example[2], example[3])

                for token in token_list:
                    write_string = prepare_line(token, example[1])
                    writer.write(write_string + '\n')

                writer.write('\n')

        split_id += 1
Example #6
 def __init__(
         self,
         categories: List[str],
         polarities: List[str],
         tokenizer: Callable[[str], List[str]] = lambda x: x.split(),
         token_indexers: Dict[str, TokenIndexer] = None,
         position_indexers: Dict[str, TokenIndexer] = None,
         core_nlp: my_corenlp.StanfordCoreNLP = None,
         configuration=None,
         bert_tokenizer=None,
         bert_token_indexers=None,
         sentence_constituency_indexer: Dict[str,
                                             TokenIndexer] = None) -> None:
     super().__init__(lazy=False)
     self.tokenizer = tokenizer
     self.token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer(namespace="tokens")
     }
     self.bert_tokenizer = bert_tokenizer
     self.bert_token_indexers = bert_token_indexers or {
         "bert": SingleIdTokenIndexer(namespace="bert")
     }
     self.position_indexers = position_indexers or {
         "position": SingleIdTokenIndexer(namespace='position')
     }
     self.sentence_constituency_indexer = sentence_constituency_indexer
     self.categories = categories
     self.polarities = polarities
     self.spacy_nlp = spacy.load("en_core_web_sm")
     self.spacy_nlp.add_pipe(BeneparComponent('benepar_en'))
     self.core_nlp = core_nlp
     self.configuration = configuration
Example #7
def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # disable GPU

    args = parse_args()

    nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
    ConstituencyParser = BeneparComponent("benepar_en2")

    documents = get_documents(args.documents)

    for document in documents:
        with open(document, "r") as f:
            data = json.load(f)

        sentences = data["sentences"]

        gold_spans = data["ent_spans"]
        gold_span_boundaries = [(span[0], span[1]) for span in gold_spans]

        syntactic_spans = extract_spans(ConstituencyParser, nlp, sentences)
        syntactic_spans = [
            span for span in syntactic_spans
            if (span[0], span[1]) not in gold_span_boundaries
        ]  # deduplicate
        spans = syntactic_spans + gold_spans

        data["ent_spans"] = spans

        # write out new spans to copy of document (don't overwrite)
        out_document = os.path.join(args.output_dir,
                                    os.path.basename(document))
        with open(out_document, "w") as f:
            json.dump(data, f, indent=4)
Example #8
    def __init__(self,
                 spacy_mdl='en_core_web_sm',
                 benepar_mdl='benepar_en2',
                 is_segmented=True,
                 is_tokenised=False,
                 batch_size=20,
                 take_sent_average=True,
                 scaler='minmax'):
        self.nlp = spacy.load(spacy_mdl)
        self.nlp.add_pipe(BeneparComponent(benepar_mdl), name='benepar')

        if is_segmented:
            self.nlp.add_pipe(self._prevent_sbd,
                              name='prevent-sbd',
                              before='parser')

        self.is_tokenised = is_tokenised
        if is_tokenised:
            self.nlp.tokenizer = self.nlp.tokenizer.tokens_from_list

        self.batch_size = batch_size
        self.take_sent_average = take_sent_average

        if scaler == 'minmax':
            self.scaler = MinMaxScaler()
        elif scaler == 'standard':
            self.scaler = StandardScaler()
        elif scaler is None:
            self.scaler = None
        else:
            raise ValueError(
                "'scaler' has an unexpected value. Use 'minmax' or 'standard' or None."
            )
Example #9
 def __init__(self):
     self.nlp = spacy.load('en_core_web_sm')
     self.nlp.add_pipe(BeneparComponent("benepar_en_small"))
     self.dependency = Predictor.from_path(
         "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz"
     )
     self.rule_qa2d = rulebased_qa2d.RuleBasedQa2D(self.dependency,
                                                   self.nlp)
Example #10
    def __init__(self, model_name='en'):
        load_t0 = time()
        print("load model file")
        # load spaCy basic model
        self.nlp = spacy.load(model_name)
        self.nlp.add_pipe(BeneparComponent('benepar_en_small'))
        load_t1 = time()

        print('* load model time: {:.2f}ms'.format((load_t1 - load_t0) * 1000))
Example #11
def preprocessing(language: str):
    from benepar.spacy_plugin import BeneparComponent
    import zh_core_web_trf
    import en_core_web_trf
    global ucb_parser
    if language == 'zh':
        nlp = zh_core_web_trf.load()
        ucb_parser = BeneparComponent('benepar_zh')
    elif language == 'en':
        nlp = en_core_web_trf.load()
        ucb_parser = BeneparComponent('benepar_en2')
    else:
        print('language error')
        exit(-1)

    nlp.disable_pipes('tagger', 'parser', 'attribute_ruler')
    if language == 'en':
        nlp.disable_pipe('lemmatizer')
    nlp.add_pipe('component', name='cp_parser', last=True)
    return nlp
Example #12
def get_model(spacy_model: str, coref: bool, constparse: bool) -> Language:
    """Loads a model for a language."""

    if spacy_model == 'en':
        spacy_model = 'en_core_web_sm'
    nlp = spacy.load(spacy_model)
    if coref:
        neuralcoref.add_to_pipe(nlp)
    if constparse:
        nlp.add_pipe(BeneparComponent("benepar_en2"))
    return nlp
Example #13
def init():
    global nlp

    spacy.tokens.Doc.set_extension('features', default={}, force=True)

    nlp = spacy.load('en', disable=['ner'])
    nlp.add_pipe(BeneparComponent("benepar_en_small"))
    nlp.add_pipe(extract_doc_features,
                 name='extract_doc_features',
                 first=False)

    test_me()
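
The extract_doc_features component registered above is not shown. A minimal sketch of what such a spaCy v2 pipeline component could look like, filling the custom 'features' extension set in init(); the feature names are illustrative assumptions.

def extract_doc_features(doc):
    # Hypothetical component sketch: store a few simple counts on doc._.features.
    sents = list(doc.sents)
    doc._.features = {
        "n_sentences": len(sents),
        "n_tokens": len(doc),
        # noun-phrase constituents found by benepar
        "n_NP": sum(1 for s in sents for c in s._.constituents if "NP" in c._.labels),
    }
    return doc
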
Example #14
def make_parse_trees(sents):
    nlp = spacy.load('en')
    nlp.add_pipe(BeneparComponent("benepar_en2"))
    tree_list = []
    for s in tqdm(sents):
        span = list(nlp(s).sents)[0]
        root_node = Node('ROOT', span)
        node = span_to_tree(span)
        root_node.add_child(node)
        root_node.make_rule()
        tree_list.append(root_node)
    return tree_list
Example #15
def get_model(spacy_model: str, coref: bool, constituents: bool) -> Language:
    if spacy_model == 'en':
        spacy_model = 'en_core_web_sm'
    if spacy_model not in MODEL_NAMES:
        raise ModuleNotFoundError(f'No such spaCy model "{spacy_model}"')
    nlp = spacy.load(spacy_model)
    if coref and spacy_model in COREF:
        neuralcoref.add_to_pipe(nlp)
    if constituents:
        model = CONSTITUENTS.get(spacy_model[:2], "")
        if model:
            nlp.add_pipe(BeneparComponent(model))
    return nlp
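
get_model here indexes three lookup tables (MODEL_NAMES, COREF, CONSTITUENTS) that are not part of the snippet. A plausible sketch of their shape, matching how the function uses them; the concrete model names are assumptions.

# Hypothetical tables assumed by get_model (not from the original source).
MODEL_NAMES = {"en_core_web_sm", "en_core_web_lg", "de_core_news_sm"}
COREF = {"en_core_web_sm", "en_core_web_lg"}              # models neuralcoref can extend
CONSTITUENTS = {"en": "benepar_en2", "de": "benepar_de"}  # keyed by two-letter prefix
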
Example #16
def load_models(spacy_model: str,
                coref: str = '',
                constituents: str = '') -> Language:
    try:
        nlp = get_model(spacy_model)
        print('loaded spacy model: ' + spacy_model)
    except OSError as e:
        print(e)
        print('Missing spaCy model. Try running: python -m spacy download ' +
              spacy_model)
        print('Defaulting to en_core_web_sm')
        nlp = spacy.load('en_core_web_sm')
    if coref:
        neuralcoref.add_to_pipe(nlp)
    if constituents:
        nlp.add_pipe(BeneparComponent(constituents))
    return nlp
Example #17
def preprocess(sentence):
    clauses = []
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(BeneparComponent('benepar_en2'))
    doc = nlp(sentence)
    if len(list(doc.sents)) == 0:
        return clauses
    sent = list(doc.sents)[0]
    children = list(sent._.children)
    puncts = '?!.,;:-'
    for clause in children:
        if clause.text not in puncts:
            if 'S' in clause._.labels:
                clauses.append((clause.text + '.'))

    if not clauses:
        return [sentence]
    else:
        return clauses
Example #18
def get_clauses(text):
    nlp = spacy.load('en')
    nlp.add_pipe(BeneparComponent("benepar_en"))

    text = text.replace(";", ".")
    text = text.replace("\n", " ")
    doc = nlp(text)

    subsentences = []
    for sent in doc.sents:
        subtexts = create_tree(sent)
        subtexts = lower_no_punct(subtexts)
        subsent1 = remove_double_subsents(subtexts)
        subsent1 = reorder_subsents(sent.text, subsent1)
        subsent2 = concatenate_sep_words(subsent1)
        final = capitalize_first_letters(subsent2)
        subsentences = subsentences + final

    return subsentences
def parse_file(file_path: str) -> list:
    def convert_bytes(num):
        """
        this function will convert bytes to MB.... GB... etc
        """
        for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
            if num < 1024.0:
                return "%3.1f %s" % (num, x)
            num /= 1024.0

    def get_file_size(file_path):
        """
        this function will return the file size
        """
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)

    file_size = get_file_size(file_path)
    if re.search(r'[GT]B', file_size):
        logger.error("File size is %s. This is too large!" % file_size)
        return []
    else:
        nlp = spacy.load("en_core_web_lg")
        nlp.add_pipe(BeneparComponent("benepar_en2"))
        f = open(file_path, "r")
        chapter = {}
        for idx, paragraph in enumerate(filter(lambda line: line != "", map(lambda line: line.strip('\n'), f.readlines()))):
            if idx == 0:
                title = paragraph
                chapter["Title"] = title
                continue
            else:
                doc = nlp(paragraph)
                chapter["Paragraphs"] = chapter.get("Paragraphs", []) + [parse_paragraph(doc)]
        print(chapter)
            # grammatical_sentences = list(doc.sents)


            # print(list(grammatical_sentences[0]))
            # pdb.set_trace()
        f.close()
Example #20
def main(args):
    if args.cuda:
        spacy.require_gpu()
    # Load a spaCy model (supported models are "es" and "en")
    print("Loading spacy...")
    nlp = spacy.load("en_core_web_lg")
    print("Done")
    nlp.tokenizer = lambda text: whitespace_tokenizer(text, nlp.vocab)
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after="tagger")
    nlp.add_pipe(BeneparComponent("benepar_en2"))

    with open(args.data) as f:
        lines = [line.strip() for line in list(f)]

    all_texts = []
    all_feats = []
    docs = nlp.pipe(lines, batch_size=args.batch_size)
    for doc in tqdm(docs, desc="Extracting feats", total=len(lines)):
        doc_feats = []
        doc_texts = []
        for token in doc:
            t_feats = extract_feats(token)
            doc_feats.append(t_feats)
            doc_texts.append(token.text)
        all_feats.append(doc_feats)
        all_texts.append(doc_texts)

    with open(args.data.replace(".tok", ".feats"), "w") as f:
        f.write("|".join((";".join(fn[:2]) for fn in FEATS)))
        f.write("\n")
        for text, doc_feats in zip(all_texts, all_feats):
            t_feats_joined = ["|".join(tf) for tf in doc_feats]
            line_feats = " ".join(
                ["|".join((t, f)) for t, f in zip(text, t_feats_joined)])
            f.write(line_feats)
            f.write("\n")
Example #21
def get_clauses_df(descriptions):
    nlp = spacy.load('en')
    nlp.add_pipe(BeneparComponent("benepar_en"))

    descriptions = descriptions.assign(clauses="")
    for i in range(len(descriptions)):
        text = descriptions.iloc[i, 0]
        text = text.replace(";", ".")
        text = text.replace("\n", " ")
        doc = nlp(text)

        subsentences = []
        for sent in doc.sents:
            subtexts = create_tree(sent)
            subtexts = lower_no_punct(subtexts)
            subsent1 = remove_double_subsents(subtexts)
            subsent1 = reorder_subsents(sent.text, subsent1)
            subsent2 = concatenate_sep_words(subsent1)
            final = capitalize_first_letters(subsent2)
            subsentences = subsentences + final

        descriptions.at[i, 'clauses'] = subsentences

    return descriptions
import spacy
import tensorflow as tf
from models.vdcnn.classifier_protocol import VeryDeepCNN
from benepar.spacy_plugin import BeneparComponent
from polyglot.text import Text, Word

tf.flags.DEFINE_integer('layer_index', None, 'layer index')
tf.flags.DEFINE_integer('top_k', 10, '')
tf.flags.DEFINE_string('task', None, '')
tf.flags.DEFINE_integer('num_align', 10, 'number of concepts to be aligned')
tf.flags.DEFINE_integer('num_units', None, '')

FLAGS = tf.flags.FLAGS
model = VeryDeepCNN(task=FLAGS.task)

nlp = spacy.load('en')
nlp.add_pipe(BeneparComponent("benepar_en_small"))


def lemma_custom(token):
    if token.lemma_ == '-PRON-':
        return token.text

    if token.lemma_ == 'be':
        return token.text

    return token.lemma_


def get_layer_name(layer_index):
    return 'conv_%d' % layer_index
Example #23
def nlp(scope='module'):
    import spacy
    from benepar.spacy_plugin import BeneparComponent
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(BeneparComponent('benepar_en'))
    return nlp
Example #24
 def init_spacy_english_model():
     nlp = spacy.load('en_core_web_lg')
     nlp.add_pipe(BeneparComponent('benepar_en'))
     return nlp
Example #25
 def __init__(self):
     self.nlp = spacy.load('en')
     self.nlp.add_pipe(BeneparComponent("benepar_en2"))
Example #26
def nlp():
    nlp = spacy.load('en_core_web_lg')
    nlp.add_pipe(BeneparComponent('benepar_en'))
    return nlp
Example #27
import spacy

nlp = spacy.load("en_core_web_lg")

from benepar.spacy_plugin import BeneparComponent
nlp.add_pipe(BeneparComponent("benepar_en2"))

doc = nlp(
    u"The alpine wildflowers are in bloom all around us. Truly a magnificent scene. I feel very privileged to be in this place of such jaw-dropping splendour. The one quibble I have with the world in this moment of bliss is that I feel slightly fatigued -- I suppose this is clear evidence, if any more was needed, that I have become physiologically dependent on coffee. Ah, coffee... I would really very much like some coffee. Drugs, loves, doves."
)

for sent in list(doc.sents):
    print(sent._.parse_string)
    print(sent._.labels)
    print(list(sent._.constituents))
    # for token in sent:
    #     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_)
'''(S (NP (DT The) (JJ alpine) (NNS wildflowers)) (VP (VBP are) (PP (IN in) (NP (NN bloom))) (PP (DT all) (IN around) (NP (PRP us)))) (. .))
('S',)
[The alpine wildflowers are in bloom all around us., The alpine wildflowers, The, alpine, wildflowers, are in bloom all around us, are, in bloom, in, bloom, all around us, all, around, us, .]


The the DET DT det
alpine alpine ADJ JJ compound
wildflowers wildflower NOUN NNS nsubj
are be AUX VBP ROOT
in in ADP IN prep
bloom bloom NOUN NN pobj
all all DET DT advmod
around around ADP IN prep
us -PRON- PRON PRP pobj
'''
Example #28
        node = ConstituencyTreeNode(labels,
                                    text,
                                    node_id=node_id,
                                    start=start,
                                    end=end,
                                    depth=depth)
        children = list(sentence._.children)
        for child in children:
            node.children.append(
                ConstituencyTreeNode._inner_parse_using_spacy(child,
                                                              serial_number,
                                                              depth=depth + 1))
        return node


if __name__ == '__main__':
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(BeneparComponent('benepar_en2'))
    sentence = 'Great taste bad service.'
    tree = ConstituencyTreeNode.parse_using_spacy(nlp, sentence)
    # adjacency_list = tree.get_adjacency_list()
    all_nodes = ConstituencyTreeNode.get_all_nodes(tree)
    adjacency_list = tree.get_adjacency_list_between_all_node_and_leaf()
    edges = [['%s-%s' % (e2.node_id, e2.text) for e2 in e1]
             for e1 in adjacency_list]
    g = nx.DiGraph()
    g.add_edges_from(edges)
    pos = nx.kamada_kawai_layout(g)
    nx.draw(g, pos=pos, with_labels=True)
    plt.show()
    print()
Example #29
import json
import fileinput
import functools
from itertools import groupby

import spacy
from benepar.spacy_plugin import BeneparComponent
from nltk import Tree

nlp = spacy.load("en_core_web_sm", disable=["ner"])
benepar = BeneparComponent("benepar_en2")


def until_convergence(fn):
    @functools.wraps(fn)
    def wrapper(arg, *args, **kwargs):
        old = object()
        new = arg
        while old != new:
            old = new
            new = fn(old, *args, **kwargs)
        return new

    return wrapper
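
A brief usage sketch for the until_convergence decorator above; the wrapped function is a made-up illustration, not part of the original module.

@until_convergence
def drop_leading_that(clauses):
    # Strip one leading "that" token per clause per pass; the decorator
    # re-applies the function until the result stops changing.
    return [c[1:] if c and c[0].lower() == "that" else c for c in clauses]

print(drop_leading_that([["that", "that", "he", "left"], ["she", "stayed"]]))
# -> [['he', 'left'], ['she', 'stayed']]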


def join_tiny_clauses_with_next(clauses):
    # [["This", "is", "just"], ["because"], ["it", "is", "so"]]
    # [["This", "is", "just"], ["because", "it", "is", "so"]]
    clauses_rev = iter(reversed(clauses))