def txt_transformer(file_conllu):
    s_list = list()
    with open(file_conllu, 'r') as f:
        ok = f.read()
    try:
        conll = pc.load_from_string(ok)
    except pc.exception.ParseError:
        conll = pc.load_from_string(to_conllu(ok))
    for s in conll:
        s_txt = ""
        for word in s[:-1]:
            s_txt = s_txt + " " + word.form
        s_txt = s_txt.strip() + ".\n"
        s_list.append(s_txt)
    return u''.join(s_list).encode('utf-8')
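
# A minimal driver for txt_transformer (a sketch, not part of the original):
# it assumes `pc` is pyconll imported under that alias and that `to_conllu`
# is a project-local converter used as a fallback for non-CoNLL-U input.
import pyconll as pc

raw = txt_transformer("example.conllu")  # illustrative path
print(raw.decode("utf-8"))               # one period-terminated sentence per line
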
def load_sentences(n, f):
    count = 0
    lines = ""
    line = f.readline()
    while line and count < n:
        lines += line
        # line.replace("sent_id", "sent_id =") if (opt.use_v1 and opt.get_ids) else line
        if line == "\n":
            count += 1
        line = f.readline()
    not_empty = bool(line)
    try:
        conll = load_from_string(lines)
    except Exception:
        conll = load_from_string("")
        print("bad conll")
    return conll, not_empty
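
# Sketch of reading a treebank in batches with load_sentences; the file name
# and batch size are illustrative, and `load_from_string` is assumed to be
# pyconll.load_from_string, as inside the function itself.
with open("corpus.conllu", "r") as f:
    more = True
    while more:
        batch, more = load_sentences(100, f)
        for sent in batch:
            pass  # process each pyconll sentence here
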
def load_predictions(args):
    # Regular CoNLL-U format
    if args.pred_upos_index == 3 and args.pred_xpos_index == 4 and args.pred_feats_index == 5:
        return pyconll.load_from_file(args.prediction)
    # other format
    else:
        s = ""
        with open(args.prediction, 'r') as pred_file:
            for line in pred_file:
                if line.strip() == "":
                    s += line
                elif line.startswith("#"):
                    s += line
                else:
                    elements = line.split("\t")
                    if 0 <= args.pred_upos_index < len(elements):
                        upos = elements[args.pred_upos_index].strip()
                    else:
                        upos = "_"
                    if 0 <= args.pred_xpos_index < len(elements):
                        xpos = elements[args.pred_xpos_index].strip()
                    else:
                        xpos = "_"
                    if 0 <= args.pred_feats_index < len(elements):
                        feats = elements[args.pred_feats_index].strip()
                    else:
                        feats = "_"
                    s += "0\t_\t_\t{}\t{}\t{}\t0\t_\t_\t_\n".format(upos, xpos, feats)
        return pyconll.load_from_string(s)
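
# Illustrative call with a hand-built namespace in place of parsed CLI
# arguments; per the function's own convention, indices 3/4/5 mean standard
# CoNLL-U, and anything else triggers the column-remapping branch.
from argparse import Namespace

args = Namespace(prediction="preds.tsv",  # hypothetical path
                 pred_upos_index=1,
                 pred_xpos_index=2,
                 pred_feats_index=3)
predictions = load_predictions(args)
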
def __call__(self, text):
    sentences = self.tokenize(text)
    for s in sentences:
        self.tag(s)
        self.parse(s)
    conllu = self.write(sentences, "conllu")
    result = pyconll.load_from_string(conllu)
    return result
def parse_syntax(self, text_str):
    processed = self.udpipe_pipeline.process(text_str, self.udpipe_error)
    if self.udpipe_error.occurred():
        logging.error("An error occurred when running run_udpipe: %s",
                      self.udpipe_error.message)
        return None
    parsed_data = pyconll.load_from_string(processed)[0]
    return parsed_data
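
# Sketch of consuming parse_syntax; `analyzer` stands in for an instance of
# the surrounding class. The method returns the first pyconll sentence of the
# UDPipe output, or None on a pipeline error, so callers should guard for None.
sentence = analyzer.parse_syntax("Mary had a little lamb.")
if sentence is not None:
    for token in sentence:
        print(token.id, token.form, token.deprel)
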
def __call__(self, text, ner_list):
    sentences = self.tokenize(text)
    for s in sentences:
        self.tag(s)
        self.parse(s)
    conllu = self.write(sentences, "conllu")
    result = pyconll.load_from_string(conllu)
    for word, ner_tag in zip(result[0], ner_list):
        word.misc = ner_tag
    return result
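
# Usage sketch; `pipeline` stands in for an instance of the class above, and
# the tags are illustrative. Note that the NER labels are zipped against
# result[0] only, so this variant implicitly assumes a single-sentence input.
tags = ["B-PER", "O", "O", "O", "O"]
annotated = pipeline("John likes green apples.", tags)
print(annotated[0][0].misc)  # -> "B-PER"
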
def test_load_from_string():
    """
    Test that a CoNLL file can properly be loaded from a string.
    """
    with open(fixture_location('basic.conll')) as f:
        contents = f.read()
    c = load_from_string(contents)
    sent = c[1]

    assert len(c) == 4
    assert len(sent) == 14
    assert sent['10'].form == 'donc'
def add_speakers(conll_string, path_to_csv):
    import pyconll
    import pandas as pd

    df = pd.read_csv(path_to_csv)
    speaker_list = df["Speaker"]
    conll = pyconll.load_from_string(conll_string)
    assert len(conll) == len(speaker_list), \
        "List of speakers and list of sentences not in sync ({} speakers, {} sentences)".format(
            len(speaker_list), len(conll))
    for c, s in zip(conll, speaker_list):
        c.set_meta("speaker", s)
    return conll
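
# Minimal sketch of calling add_speakers; the file names are illustrative and
# the CSV must hold one "Speaker" row per sentence in the CoNLL string.
with open("dialogue.conllu") as f:
    annotated = add_speakers(f.read(), "speakers.csv")
for sent in annotated:
    print(sent.meta_value("speaker"), sent.text)
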
def get_triples(input_part=9100):
    """
    Empty nsubj and dobj are represented by empty string.
    """
    config = configparser.ConfigParser()
    config.read('config.ini')
    outfilen = os.path.join(config['DEFAULT']['ProjectDirectory'],
                            f'dataframe/freq{input_part}.pkl')
    if os.path.exists(outfilen):
        logging.info(f'File exists: {outfilen}')
        return
    triples = []
    for filen in sorted(glob(
            f'/mnt/permanent/Language/English/Crawl/DepCC/corpus/parsed/part-m-*{input_part}.gz')):
        logging.info(filen)
        for sent_str in depcc_to_conllu(filen):
            try:
                train = pyconll.load_from_string(sent_str)
            except Exception as e:
                logging.error(e)
                continue
            sentence = train.pop()  # sent_str is only one sentence.
            triples_in_sent = defaultdict(lambda: {'nsubj': '', 'dobj': ''})
            # triples = {id_of_root: {'nsubj': 'dog'}}
            # Collecting the arguments..
            for token in sentence:
                if token.deprel in ['nsubj', 'dobj']:
                    triples_in_sent[token.head][token.deprel] = token.lemma
            # Collecting the verbs, not only the main pred of the sentence.
            for id_form_1 in triples_in_sent:
                if sentence[int(id_form_1)].upos.startswith('V'):
                    verb = sentence[int(id_form_1)].lemma
                    triples_in_sent[id_form_1]['ROOT'] = verb
            # Appending full triples to the list..
            for triple in triples_in_sent.values():
                if 'ROOT' in triple:
                    triples.append(triple)
    df = pd.DataFrame(triples)
    df = df.groupby(list(df.columns)).size().sort_values(ascending=False)
    df.to_frame(name='freq').to_pickle(outfilen)
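
# Quick look at the pickle written by get_triples; the path mirrors the
# `outfilen` pattern above (relative to ProjectDirectory) and the part number
# is illustrative.
import pandas as pd

freq = pd.read_pickle("dataframe/freq9100.pkl")
print(freq.head())  # (nsubj, dobj, ROOT) triples with their frequencies
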
def test_load_from_file_and_string_equivalence():
    """
    Test that the Conll object created from a string and file is the same
    if the underlying source is the same.
    """
    with open(fixture_location('long.conll')) as f:
        contents = f.read()
    str_c = load_from_string(contents)
    file_c = load_from_file(fixture_location('long.conll'))

    assert len(str_c) == len(file_c)
    for i in range(len(str_c)):
        assert str_c[i].id == file_c[i].id
        assert str_c[i].text == file_c[i].text
        print(str_c[i].conll())
        print(file_c[i].conll())

        for str_token in str_c[i]:
            file_token = file_c[i][str_token.id]
            assert_token_members(str_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
def check(self, sentence):
    doc_de = self.nlp_de(sentence)
    conllu = doc_de.conll_file.conll_as_string()
    conll = pyconll.load_from_string(conllu)
    print(dir(conll))
    translate_client = translate.Client()
    english = translate_client.translate(
        sentence, target_language='en')['translatedText']
    print("ENGLISH " + html.unescape(english))
    doc_en = self.nlp_en(html.unescape(english))
    sentences_ret = []
    for sentence in doc_de.sentences:
        words_ret = [ReturnWord() for _ in range(len(sentence.words))]
        for word in sentence.words:
            # Translate each lemma and record alternative translations.
            eng = self.translator.translate(word.lemma, src='de', dest='en')
            words_ret[int(word.index) - 1].translation = eng.text
            if eng.extra_data['all-translations'] is not None:
                words_ret[int(word.index) - 1].other_translations.extend(
                    eng.extra_data['all-translations'][0][1])
            words_ret[int(word.index) - 1].set_vars(
                word.text, int(word.index), word.upos, word.xpos,
                word.governor, word.dependency_relation)
            # Prepositions govern the case of their head noun.
            if word.upos == 'ADP':
                if word.lemma in self.prep_akk:
                    words_ret[int(word.index) - 1].notes.append(
                        word.text + " takes the accusative case.")
                    words_ret[word.governor - 1].notes.append(
                        "Because " + word.text + " is accusative, so is " +
                        sentence.words[word.governor - 1].text)
                    words_ret[word.governor - 1].case = "Acc"
                elif word.lemma in self.prep_dat:
                    words_ret[int(word.index) - 1].notes.append(
                        word.text + " takes the dative case.")
                    words_ret[word.governor - 1].notes.append(
                        "Because " + word.text + " is dative, so is " +
                        sentence.words[word.governor - 1].text)
                    words_ret[word.governor - 1].case = "Dat"
                elif word.lemma in self.prep_acc_dat:
                    words_ret[int(word.index) - 1].notes.append(
                        word.text +
                        " takes Dative if it answers the question 'where?'")
                    words_ret[int(word.index) - 1].notes.append(
                        word.text +
                        " takes the Accusative if it answers the question 'where to?'")
                    words_ret[word.governor - 1].notes.append(
                        sentence.words[word.governor - 1].text +
                        " takes either the Accusative or Dative case.")
                    words_ret[word.governor - 1].case = "Acc|Dat"
            # Re-parse a bare noun to recover its lexical gender.
            if word.upos == 'NOUN':
                words_ret[int(word.index) - 1].gender = (
                    self.nlp_de(word.text).sentences[0].words[0]
                    .feats.split('|')[1].split('=')[1])
            # Fill gender/number/case from the word's own morphological feats.
            for feat in word.feats.split('|'):
                pair = feat.split('=')
                if pair[0] == "Gender" and words_ret[int(word.index) - 1].gender == "":
                    words_ret[int(word.index) - 1].gender = pair[1]
                elif pair[0] == "Number":
                    words_ret[int(word.index) - 1].number = pair[1]
                elif pair[0] == "Case" and words_ret[int(word.index) - 1].case == "":
                    words_ret[int(word.index) - 1].case = pair[1]
            if word.dependency_relation in ('obj', 'dobj'):
                words_ret[int(word.index) - 1].notes.append(
                    word.text +
                    " is probably a direct object, which takes the accusative case.")
                words_ret[int(word.index) - 1].case = "Acc"
            if word.dependency_relation == 'iobj':
                words_ret[int(word.index) - 1].notes.append(
                    word.text +
                    " is probably an indirect object, which takes the dative case.")
                words_ret[int(word.index) - 1].case = "Dat"
        # Second pass: check that each article agrees with its noun's case.
        for word in sentence.words:
            if word.xpos == 'ART':
                if 'Acc' in words_ret[word.governor - 1].case:
                    nom = self.nominative[words_ret[word.governor - 1].gender]
                    article = self.accusative[words_ret[word.governor - 1].gender]
                    words_ret[int(word.index) - 1].notes.append(
                        "If %s %s takes the accusative case the article should be %s"
                        % (nom, words_ret[word.governor - 1].text, article))
                if 'Dat' in words_ret[word.governor - 1].case:
                    nom = self.nominative[words_ret[word.governor - 1].gender]
                    article = self.dative[words_ret[word.governor - 1].gender]
                    words_ret[int(word.index) - 1].notes.append(
                        "If %s %s takes the dative case the article should be %s"
                        % (nom, words_ret[word.governor - 1].text, article))
                if 'Gen' in words_ret[word.governor - 1].case:
                    nom = self.nominative[words_ret[word.governor - 1].gender]
                    article = self.genitive[words_ret[word.governor - 1].gender]
                    words_ret[int(word.index) - 1].notes.append(
                        "If %s %s takes the genitive case the article should be %s"
                        % (nom, words_ret[word.governor - 1].text, article))
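
# The article checks above index four gender-keyed lookup tables defined
# elsewhere in the class; a plausible sketch of their contents (standard
# German definite articles), not taken from the original source:
nominative = {"Masc": "der", "Fem": "die", "Neut": "das"}
accusative = {"Masc": "den", "Fem": "die", "Neut": "das"}
dative = {"Masc": "dem", "Fem": "der", "Neut": "dem"}
genitive = {"Masc": "des", "Fem": "der", "Neut": "des"}
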
    return new_sents, alt_feat_dict


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="generate noisy outputs")
    parser.add_argument("-unimorph", type=str, help="unimorph file for lg")
    parser.add_argument("-orig", type=str, help="original conllu")
    parser.add_argument("-alt", type=str, help="altered conllu")
    args = parser.parse_args()

    treebank = args.orig
    new_treebank = args.alt
    data = pyconll.load_from_file(treebank)
    um = read_um(args.unimorph)
    um2ud, ud2um = read_ud_um()
    random.seed(23)
    new_sents, alt_feat_dict = sample_noise(data, um, um2ud)
    with open(new_treebank, "w") as op:
        for new_sent in new_sents:
            # Validate that each generated block parses as exactly one sentence.
            sent_ = pyconll.load_from_string("\n".join(new_sent))
            assert len(sent_) == 1, "more than one sentence"
            # op.write("# text = " + " ".join([t_.form for t_ in sent_[0] if t_.form]) + "\n")
            op.write("\n".join(new_sent) + "\n\n")
    alt_feat_dict = sorted(alt_feat_dict.items(), key=lambda x: x[0])
    print(args.unimorph.split("/")[-1])
    for (pos, feat), count in alt_feat_dict:
        print("%s:%s\t%.2f" % (pos, feat, count * 100 / len(new_sents)))
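
# Illustrative sanity check on the altered treebank produced above; the file
# name is an assumption standing in for the -alt argument.
check = pyconll.load_from_file("de_gsd-ud-train.noisy.conllu")
print(len(check), "sentences in the altered treebank")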