Example #1
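Converts a CoNLL-U file back to plain text, one sentence per line. Assumes pyconll is imported as pc and that a project-specific to_conllu() converter is available as a fallback for input that is not valid CoNLL-U.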
def txt_transformer(file_conllu):
    s_list = []
    with open(file_conllu, 'r') as f:
        contents = f.read()
    try:
        conll = pc.load_from_string(contents)
    except pc.exception.ParseError:
        # Input was not valid CoNLL-U; convert it first, then parse again.
        conll = pc.load_from_string(to_conllu(contents))
    for s in conll:
        s_txt = ""
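        # Join token forms, dropping the final token; a period is appended instead.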
        for word in s[:-1]:
            s_txt = s_txt + " " + word.form
        s_txt = s_txt.strip() + ".\n"
        s_list.append(s_txt)
    return ''.join(s_list).encode('utf-8')
Example #2
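Assumes load_from_string is imported from pyconll.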
def load_sentences(n, f):
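    """Read up to n blank-line-delimited sentences from f and parse them."""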
    count = 0
    lines = ""
    line = f.readline()
    while line and count < n:
        lines += line  # line.replace("sent_id", "sent_id =") if (opt.use_v1 and opt.get_ids) else line
        if line == "\n":
            count += 1
        line = f.readline()
    not_empty = bool(line)
    try:
        conll = load_from_string(lines)
    except Exception:
        conll = load_from_string("")
        print("bad conll")
    return conll, not_empty
Example #3
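Loads predictions either directly as CoNLL-U or, for other column layouts, by rebuilding minimal CoNLL-U lines from the column indices given on the command line. Assumes pyconll is imported and that args comes from the script's argparse parser.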
def load_predictions(args):
	# Regular CoNLL-U format
	if args.pred_upos_index == 3 and args.pred_xpos_index == 4 and args.pred_feats_index == 5:
		return pyconll.load_from_file(args.prediction)
	
	# Other column layout: rebuild a minimal CoNLL-U string
	else:
		s = ""
		with open(args.prediction, 'r') as pred_file:
			for line in pred_file:
				if line.strip() == "":
					s += line
				elif line.startswith("#"):
					s += line
				else:
					elements = line.split("\t")
					if 0 <= args.pred_upos_index < len(elements):
						upos = elements[args.pred_upos_index].strip()
					else:
						upos = "_"
					if 0 <= args.pred_xpos_index < len(elements):
						xpos = elements[args.pred_xpos_index].strip()
					else:
						xpos = "_"
					if 0 <= args.pred_feats_index < len(elements):
						feats = elements[args.pred_feats_index].strip()
					else:
						feats = "_"
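					# Placeholder values in every column except UPOS, XPOS and FEATS.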
					s += "0\t_\t_\t{}\t{}\t{}\t0\t_\t_\t_\n".format(upos, xpos, feats)
		return pyconll.load_from_string(s)
Example #4
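Apparently a pipeline wrapper: tokenize, tag, parse and write are methods of the enclosing class, and the CoNLL-U output is round-tripped through pyconll.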
    def __call__(self, text):
        sentences = self.tokenize(text)
        for s in sentences:
            self.tag(s)
            self.parse(s)
        conllu = self.write(sentences, "conllu")
        result = pyconll.load_from_string(conllu)
        return result
Example #5
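Assumes a ufal.udpipe Pipeline in self.udpipe_pipeline and a ProcessingError in self.udpipe_error; returns the first parsed sentence, or None if UDPipe reports an error.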
    def parse_syntax(self, text_str):
        processed = self.udpipe_pipeline.process(text_str, self.udpipe_error)
        if self.udpipe_error.occurred():
            logging.error("An error occurred when running run_udpipe: %s", self.udpipe_error.message)
            return None

        # Keep only the first sentence of the parsed output.
        parsed_data = pyconll.load_from_string(processed)[0]
        return parsed_data
Example #6
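A variant of Example #4 that also attaches NER tags to the tokens of the first sentence. Note that pyconll parses a token's MISC column into a dict, so assigning a raw string to word.misc relies on downstream code reading the attribute back as-is.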
    def __call__(self, text, ner_list):
        sentences = self.tokenize(text)
        for s in sentences:
            self.tag(s)
            self.parse(s)
        conllu = self.write(sentences, "conllu")
        result = pyconll.load_from_string(conllu)
        for word, ner_tag in zip(result[0], ner_list):
            word.misc = ner_tag
        return result
Example #7
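Apparently from pyconll's own test suite; fixture_location is a test helper.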
def test_load_from_string():
    """
    Test that a CoNLL file can properly be loaded from a string.
    """
    with open(fixture_location('basic.conll')) as f:
        contents = f.read()

    c = load_from_string(contents)
    sent = c[1]

    assert len(c) == 4
    assert len(sent) == 14
    assert sent['10'].form == 'donc'
Example #8
def add_speakers(conll_string, path_to_csv):
    import pyconll
    import pandas as pd

    df = pd.read_csv(path_to_csv)
    speaker_list = df["Speaker"]

    conll = pyconll.load_from_string(conll_string)

    assert len(conll) == len(speaker_list), \
        "List of speakers and list of sentences not in sync " \
        "({} speakers, {} sentences)".format(len(speaker_list), len(conll))
    for c, s in zip(conll, speaker_list):
        c.set_meta("speaker", s)

    return conll
Example #9
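Extracts verb-subject/object triples from the DepCC corpus and aggregates their frequencies. Assumes a depcc_to_conllu() helper that yields one CoNLL-U sentence string at a time, plus the configparser, glob, logging, os, collections, pandas and pyconll imports of the original script.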
def get_triples(input_part=9100):
    """
    Empty nsubj and dobj are represented by empty string.
    """
    config = configparser.ConfigParser()
    config.read('config.ini')
    outfilen = os.path.join(config['DEFAULT']['ProjectDirectory'],
                            f'dataframe/freq{input_part}.pkl')
    if os.path.exists(outfilen):
        logging.info(f'File exists: {outfilen}')
        return
    triples = []
    for filen in sorted(
            glob(
                f'/mnt/permanent/Language/English/Crawl/DepCC/corpus/parsed/part-m-*{input_part}.gz'
            )):
        logging.info(filen)
        for sent_str in depcc_to_conllu(filen):
            try:
                train = pyconll.load_from_string(sent_str)
            except Exception as e:
                logging.error(e)
                continue
            sentence = train.pop()  # sent_str is only one sentence.
            triples_in_sent = defaultdict(lambda: {'nsubj': '', 'dobj': ''})
            # triples_in_sent maps head id -> arguments, e.g. {'2': {'nsubj': 'dog'}}

            # Collect the nsubj/dobj arguments.
            for token in sentence:
                if token.deprel in ['nsubj', 'dobj']:
                    triples_in_sent[token.head][token.deprel] = token.lemma

            # Collect the verbs, not only the main predicate of the sentence.
            for id_form_1 in triples_in_sent:
                if sentence[int(id_form_1)].upos.startswith('V'):
                    verb = sentence[int(id_form_1)].lemma
                    triples_in_sent[id_form_1]['ROOT'] = verb

            # Append complete triples to the list.
            for triple in triples_in_sent.values():
                if 'ROOT' in triple:
                    triples.append(triple)

    df = pd.DataFrame(triples)
    df = df.groupby(list(df.columns)).size().sort_values(ascending=False)
    df.to_frame(name='freq').to_pickle(outfilen)
Example #10
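Also apparently from pyconll's test suite; assert_token_members is a test helper.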
def test_load_from_file_and_string_equivalence():
    """
    Test that the Conll object created from a string and file is the same if
    the underlying source is the same.
    """
    with open(fixture_location('long.conll')) as f:
        contents = f.read()
    str_c = load_from_string(contents)
    file_c = load_from_file(fixture_location('long.conll'))

    assert len(str_c) == len(file_c)
    for i in range(len(str_c)):
        assert str_c[i].id == file_c[i].id
        assert str_c[i].text == file_c[i].text
        print(str_c[i].conll())
        print(file_c[i].conll())

        for str_token in str_c[i]:
            file_token = file_c[i][str_token.id]
            assert_token_members(str_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
Example #11
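A German grammar-checking method written against the old stanfordnlp API (word.index, word.governor, doc.conll_file) and two translation clients. prep_akk, prep_dat, prep_acc_dat and the nominative/accusative/dative/genitive article tables are attributes of the enclosing class.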
    def check(self, sentence):
        doc_de = self.nlp_de(sentence)
        conllu = doc_de.conll_file.conll_as_string()
        conll = pyconll.load_from_string(conllu)
        print(dir(conll))
        translate_client = translate.Client()

        english = translate_client.translate(
            sentence, target_language='en')['translatedText']
        print("ENGLISH " + html.unescape(english))
        doc_en = self.nlp_en(html.unescape(english))

        #print(doc_de.conll_file.conll_as_string())
        sentences_ret = []

        for sentence in doc_de.sentences:
            words_ret = [ReturnWord() for _ in range(len(sentence.words))]
            for word in sentence.words:
                #print("[%d] Word: %s" % (int(word.index), word.text))
                eng = self.translator.translate(word.lemma,
                                                src='de',
                                                dest='en')
                #print(dir(words_ret[int(word.index) - 1].translations))
                #print(format(eng.extra_data))
                #print(format(eng.extra_data['all-translations']))
                words_ret[int(word.index) - 1].translation = eng.text
                if (eng.extra_data['all-translations'] is not None):
                    words_ret[int(word.index) - 1].other_translations.extend(
                        eng.extra_data['all-translations'][0][1])
                words_ret[int(word.index) - 1].set_vars(
                    word.text, int(word.index), word.upos, word.xpos,
                    word.governor, word.dependency_relation)
                #print("%s, %s, %s, %s" % (words_ret[int(word.index) - 1].text, words_ret[int(word.index) - 1].upos, words_ret[int(word.index) - 1].xpos, words_ret[int(word.index) - 1].relation))
                if (word.upos == 'ADP'):
                    #print("%s is ADP" % word.text)
                    if (word.lemma in self.prep_akk):
                        #print("%s is accusative" % word.text)
                        words_ret[int(word.index) - 1].notes.append(
                            word.text + " takes the accusative case.")
                        words_ret[word.governor - 1].notes.append(
                            "Because " + word.text + " is accusative, so is " +
                            sentence.words[word.governor - 1].text)
                        words_ret[word.governor - 1].case = "Acc"
                    elif (word.lemma in self.prep_dat):
                        #print("%s is dative" % word.text)
                        words_ret[int(word.index) - 1].notes.append(
                            word.text + " takes the dative case.")
                        words_ret[word.governor - 1].notes.append(
                            "Because " + word.text + " is dative, so is " +
                            sentence.words[word.governor - 1].text)
                        words_ret[word.governor - 1].case = "Dat"
                    elif (word.lemma in self.prep_acc_dat):
                        #print("%s is akk/dat" % word.text)
                        words_ret[int(word.index) - 1].notes.append(
                            word.text +
                            " takes Dative if it answers the question 'where?'"
                        )
                        words_ret[int(word.index) - 1].notes.append(
                            word.text +
                            " takes the Accusative if it answers the question 'where to?'"
                        )
                        words_ret[word.governor - 1].notes.append(
                            sentence.words[word.governor - 1].text +
                            " takes either the Accusative or Dative case.")
                        words_ret[word.governor - 1].case = "Acc|Dat"
                if (word.upos == 'NOUN'):
                    # Re-tag the bare noun and read its second feature (assumed to be Gender).
                    noun_feats = self.nlp_de(word.text).sentences[0].words[0].feats
                    words_ret[int(word.index) - 1].gender = noun_feats.split('|')[1].split('=')[1]
                    #print("%s, %s" % (words_ret[int(word.index) - 1].gender, word.text))

                feats = word.feats.split('|')
                for feat in feats:
                    pair = feat.split('=')
                    #print("%s: %s" % (word.text, pair))
                    if (pair[0] == "Gender"
                            and words_ret[int(word.index) - 1].gender == ""):
                        words_ret[int(word.index) - 1].gender = pair[1]
                    elif (pair[0] == "Number"):
                        words_ret[int(word.index) - 1].number = pair[1]
                    elif (pair[0] == "Case"
                          and words_ret[int(word.index) - 1].case == ""):
                        words_ret[int(word.index) - 1].case = pair[1]
                    #if(len(pair) is 2): print("%s gender: %s (pair[1] is %s)" % (word.text, words_ret[int(word.index) - 1].gender, pair[1]))
                if (word.dependency_relation == 'obj'
                        or word.dependency_relation == 'dobj'):
                    words_ret[int(word.index) - 1].notes.append(
                        word.text +
                        " is probably a direct object, which takes the accusative case."
                    )
                    words_ret[int(word.index) - 1].case = "Acc"
                if (word.dependency_relation == 'iobj'):
                    words_ret[int(word.index) - 1].notes.append(
                        word.text +
                        " is probably an indirect object, which takes the dative case."
                    )
                    words_ret[int(word.index) - 1].case = "Dat"

            for word in sentence.words:
                #print("%s == 'ART' ? %s" % (word.xpos, (word.xpos == 'ART')))
                if (word.xpos == 'ART'):
                    #print(words_ret[word.governor - 1].case)
                    if ('Acc' in words_ret[word.governor - 1].case):
                        #print("%s is Acc %s" % (str(words_ret[word.governor - 1].text), str(words_ret[word.governor - 1].case)))
                        gender = words_ret[word.governor - 1].gender
                        nom = self.nominative[gender]
                        article = self.accusative[gender]
                        words_ret[int(word.index) - 1].notes.append(
                            "If %s %s takes the accusative case the article should be %s"
                            % (nom, words_ret[word.governor - 1].text, article))
                    if ('Dat' in words_ret[word.governor - 1].case):
                        #print("%s is Dat %s" % (str(words_ret[word.governor - 1].text), str(words_ret[word.governor - 1].case)))
                        gender = words_ret[word.governor - 1].gender
                        nom = self.nominative[gender]
                        article = self.dative[gender]
                        words_ret[int(word.index) - 1].notes.append(
                            "If %s %s takes the dative case the article should be %s"
                            % (nom, words_ret[word.governor - 1].text, article))
                    if ('Gen' in words_ret[word.governor - 1].case):
                        #print("%s is Gen %s" % (str(words_ret[word.governor - 1].text), str(words_ret[word.governor - 1].case)))
                        gender = words_ret[word.governor - 1].gender
                        nom = self.nominative[gender]
                        article = self.genitive[gender]
                        words_ret[int(word.index) - 1].notes.append(
                            "If %s %s takes the genitive case the article should be %s"
                            % (nom, words_ret[word.governor - 1].text, article))
Example #12
    return new_sents, alt_feat_dict


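# Assumes the script's own read_um(), read_ud_um() and sample_noise() helpers,
# plus the argparse, random and pyconll imports of the original file.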
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="generate noisy outputs")
    parser.add_argument("-unimorph", type=str, help="unimorph file for lg")
    parser.add_argument("-orig", type=str, help="original conllu")
    parser.add_argument("-alt", type=str, help="altered conllu")
    args = parser.parse_args()

    treebank = args.orig
    new_treebank = args.alt

    data = pyconll.load_from_file(treebank)

    um = read_um(args.unimorph)
    um2ud, ud2um = read_ud_um()

    random.seed(23)
    new_sents, alt_feat_dict = sample_noise(data, um, um2ud)
    with open(new_treebank, "w") as op:
        for new_sent in new_sents:
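            # Round-trip each sentence through pyconll as a sanity check before writing it out.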
            sent_ = pyconll.load_from_string("\n".join(new_sent))
            assert len(sent_) == 1, "more than one sentence"
            # op.write("# text = " + " ".join([t_.form for t_ in sent_[0] if t_.form]) + "\n")
            op.write("\n".join(new_sent) + "\n\n")
    alt_feat_dict = sorted(alt_feat_dict.items(), key=lambda x: x[0])
    print(args.unimorph.split("/")[-1])
    for (pos, feat), count in alt_feat_dict:
        print("%s:%s\t%.2f" % (pos, feat, count * 100 / len(new_sents)))