def add_tags(conllfile, taggedfile):
    """Merge POS tags from a tagged file into a CoNLL07 treebank.

    Sentences from `conllfile` (CoNLL07 columns) and `taggedfile`
    (form/postag pairs) are read in lockstep and merged per sentence by
    `updateTags`; the merged output carries only id/form/cpostag/postag
    columns and is written via ru.lines_to_file('', ...) — presumably
    stdout, verify against ru.lines_to_file.
    """
    orig_fields = cu.CONLL07_COLUMNS
    tag_fields = ('form', 'postag', )
    # Column layout used when serialising the merged sentences below.
    cu.FIELDS = ('id', 'form', 'cpostag', 'postag', )
    orig_sents = cu.sentences_from_conll(
        ru.lines_from_file(conllfile), fields=orig_fields)
    tag_sents = cu.sentences_from_conll(
        ru.lines_from_file(taggedfile), fields=tag_fields)
    merged_sents = map(updateTags, zip(orig_sents, tag_sents))
    ru.lines_to_file("", cu.sentences_to_conll(merged_sents))
    return
def get_lexiconfile(taggedfile, lexiconfile='', poskeys='', windowsize=0):
    """Build a frequency lexicon of (form, POS-context) pairs.

    Reads `taggedfile` using the module-level TAGGEDFILE_FIELDS layout,
    normalises forms with replace_token, joins the POS columns named in
    `poskeys` (comma-separated; defaults to 'udpostag') with '+', and for
    each token records the tag sequence in a +/-`windowsize` window joined
    with '_'.  Writes "freq<TAB>form<TAB>tags" lines, most frequent first,
    to `lexiconfile`.
    """
    global TAGGEDFILE_FIELDS
    FIELDS = TAGGEDFILE_FIELDS
    orig_sents = cu.sentences_from_conll(
        ru.lines_from_file(taggedfile), fields=FIELDS)
    if poskeys == '':
        # assume the default scenario to be UD tags
        pos_key = (FIELDS.index('udpostag'), )
    else:
        pos_key = tuple(FIELDS.index(key.strip()) for key in poskeys.split(','))
    mod_sents = ([(replace_token(sent[idx]['form']),
                   "+".join(sent[idx][FIELDS[key]] for key in pos_key))
                  for idx in range(len(sent))]
                 for sent in orig_sents)
    # Pad each sentence so every real token has a full context window.
    mod_sents = ([('BOS', 'BOS')] * windowsize + sent + [('EOS', 'EOS')] * windowsize
                 for sent in mod_sents)
    word_tags = ((sent[idx][0],
                  '_'.join('%s' % (sent[idx + i][1])
                           for i in range(-windowsize, windowsize + 1)))
                 for sent in mod_sents
                 for idx in range(windowsize, len(sent) - windowsize))
    freqs = defaultdict(int)
    # BUGFIX: the original used defaultdict(lambda: counter().next), which
    # creates a *fresh* counter per missing key and stores the bound method
    # object itself (never called) instead of an integer id, so the inverse
    # hashes and output formatting below emitted method reprs.  Sharing one
    # counter's .next as the factory yields proper ids 0, 1, 2, ...
    forms_hash = defaultdict(counter().next)
    tags_hash = defaultdict(counter().next)
    for word, tag in word_tags:
        freqs[(forms_hash[word], tags_hash[tag])] += 1
    inv_forms_hash = dict((wordidx, word) for word, wordidx in forms_hash.items())
    inv_tags_hash = dict((wordidx, word) for word, wordidx in tags_hash.items())
    freqs = ("%d\t%s\t%s" % (freq, inv_forms_hash[word[0]], inv_tags_hash[word[1]])
             for word, freq in sorted(freqs.iteritems(),
                                      key=itemgetter(1), reverse=True))
    ru.lines_to_file(lexiconfile, freqs)
    return
def split_conll():
    """Split the CoNLL09 file named in sys.argv[1] into 20 equal parts.

    Each part is written next to the input as
    '<prefix>-splitNN<ext><compress-ext>'.  Sentence boundaries are blank
    lines; smart_open transparently handles .gz/.bz2 input.
    """
    original_conllfile = sys.argv[1]
    if original_conllfile.endswith('.gz') or original_conllfile.endswith('.bz2'):
        conllfilename, cext = os.path.splitext(original_conllfile)
    else:
        # BUGFIX: the original left conllfilename/cext unbound for
        # uncompressed inputs, raising NameError on the next line.
        conllfilename, cext = original_conllfile, ''
    output_prefix, ext = os.path.splitext(conllfilename)
    splits = 20
    # A sentence ends at every empty (falsy) line.
    sent_count = sum(
        imap(lambda x: 0 if x else 1,
             random_utils.lines_from_file(original_conllfile)))
    split_size = sent_count / splits + 1  # Python-2 integer division
    with random_utils.smart_open(original_conllfile) as infile:
        conll_utils.FIELDS = conll_utils.CONLL09_COLUMNS
        foldlen = len(str(splits))
        conll_sents = (sentence
                       for sentence in conll_utils.sentences_from_conll(infile))
        for foldidx in xrange(splits):
            outfilepath = '%s-split%s%s%s' % (
                output_prefix, str(foldidx + 1).zfill(foldlen), ext, cext)
            with random_utils.smart_open(outfilepath, 'w') as outfile:
                try:
                    conll_utils.sentences_to_conll09(
                        outfile,
                        (conll_sents.next() for idx in xrange(split_size)))
                except StopIteration:
                    pass
    return
def conll_to_tagged(): tagged = lambda conll_sent: ' '.join('%s%s%s' % (edge['form'], delim, edge['cpostag']) for edge in conll_sent) onlyforms = lambda conll_sent: ' '.join('%s' % (edge['form']) for edge in conll_sent) print '\n'.join(imap(tagged, conll_utils.sentences_from_conll(sys.stdin))) return
def preprocess_treebanks(conllfile, outconllfile):
    """Normalise token forms of a CoNLL07 treebank via replace_token.

    Reads `conllfile`, rewrites every edge's 'form' value and writes the
    result to `outconllfile` in CoNLL07 format.
    """
    def normalise_sent(conll_sent):
        # Rebuild each edge dict with its 'form' replaced by the
        # normalised token.
        return map(lambda edge: dict(edge.items() +
                                     [('form', replace_token(edge['form']))]),
                   conll_sent)
    input_stream = ru.lines_from_file(conllfile)
    cu.FIELDS = cu.CONLL07_COLUMNS
    sentences = cu.sentences_from_conll(input_stream)
    ru.lines_to_file(outconllfile,
                     cu.sentences_to_conll07(map(normalise_sent, sentences)))
    return
def unigram_freqs(taggedfile, unigramsfile=''):
    """Write normalised word-form unigram counts, most frequent first.

    Reads `taggedfile` using the module-level TAGGEDFILE_FIELDS layout,
    normalises each form with replace_token, and writes
    "form<TAB>count" lines to `unigramsfile` sorted by descending count.
    """
    global TAGGEDFILE_FIELDS
    sents = cu.sentences_from_conll(ru.lines_from_file(taggedfile),
                                    fields=TAGGEDFILE_FIELDS)
    counts = defaultdict(int)
    for sent in sents:
        for edge in sent:
            counts[replace_token(edge['form'])] += 1
    out_lines = ("%s\t%s" % (form, freq)
                 for form, freq in sorted(counts.iteritems(),
                                          key=itemgetter(1), reverse=True))
    ru.lines_to_file(unigramsfile, out_lines)
    return
def conll07toconll09():
    """Convert a CoNLL07 file (sys.argv[1]) to CoNLL09 (sys.argv[2]).

    The CoNLL09 predicted-POS column ('ppostag') is filled by replicating
    each edge's 'cpostag'.
    """
    def copy_pos(sent):
        # Extend every edge with a ppostag entry mirroring cpostag.
        return [dict(edge.items() + [('ppostag', edge['cpostag'])])
                for edge in sent]
    with random_utils.smart_open(sys.argv[1]) as infile, \
            random_utils.smart_open(sys.argv[2], 'w') as outfile:
        conll_utils.FIELDS = conll_utils.CONLL07_COLUMNS
        conll_utils.sentences_to_conll09(
            outfile, imap(copy_pos, conll_utils.sentences_from_conll(infile)))
    return
def conll09toconll07():
    """Convert a CoNLL09 file (sys.argv[1]) to CoNLL07 (sys.argv[2]).

    sys.argv[3] selects which annotation layer is propagated:
      'blind'  -- keep only id/form/lemma plus predicted POS;
      'gold'   -- copy gold postag into cpostag/postag;
      'system' -- copy predicted POS plus predicted head/deprel;
      (absent) -- copy predicted POS only.
    NOTE(review): an earlier, more general replication scheme is recorded
    in the project history; this variant was tailored for the oov-parsing
    project.
    """
    def blind_pos(sent):
        # Drop everything except the surface columns and predicted POS.
        return [dict([('id', e['id']), ('form', e['form']),
                      ('lemma', e['lemma']), ('cpostag', e['ppostag']),
                      ('postag', e['ppostag'])]) for e in sent]

    def gold_pos(sent):
        # Keep gold annotations; both POS columns come from gold postag.
        return [dict(filter(lambda item: item[0] not in
                            ['postag', 'phead', 'pdeprel'], e.items()) +
                     [('cpostag', e['postag']), ('postag', e['postag'])])
                for e in sent]

    def predicted_pos(sent):
        # Default: both POS columns come from the predicted ppostag.
        return [dict(filter(lambda item: item[0] not in
                            ['postag', 'phead', 'pdeprel'], e.items()) +
                     [('cpostag', e['ppostag']), ('postag', e['ppostag'])])
                for e in sent]

    def system_pos(sent):
        # Fully system-predicted view: POS, head and deprel all predicted.
        return [dict(filter(lambda item: item[0] not in
                            ['postag', 'phead', 'pdeprel'], e.items()) +
                     [('cpostag', e['ppostag']), ('postag', e['ppostag']),
                      ('head', e['phead']), ('deprel', e['pdeprel'])])
                for e in sent]

    convert = predicted_pos
    if len(sys.argv) > 3 and sys.argv[3] == 'system':
        convert = system_pos
    elif len(sys.argv) > 3 and sys.argv[3] == 'blind':
        convert = blind_pos
    elif len(sys.argv) > 3 and sys.argv[3] == 'gold':
        convert = gold_pos

    with random_utils.smart_open(sys.argv[1]) as infile, \
            random_utils.smart_open(sys.argv[2], 'w') as outfile:
        conll_utils.FIELDS = conll_utils.CONLL09_COLUMNS
        conll_utils.sentences_to_conll07(
            outfile, imap(convert, conll_utils.sentences_from_conll(infile)))
    return
def reorder_cross_validated_sentences():
    """Restore the original sentence order of a shuffled CoNLL09 file.

    sys.argv[1] holds the file in original order, sys.argv[2] the shuffled
    one (cross-validated POS tagging shuffles sentences); the reordered
    sentences go to stdout.  Sentences are matched on their
    form|postag|head|deprel token signature, so a duplicate sentence is
    mapped back to every original position it occupied.
    """
    original_conllfile = sys.argv[1]
    shuffled_conllfile = sys.argv[2]

    def signature(sentence):
        # Key used to match a shuffled sentence to its original slot(s).
        return ' '.join('%s|%s|%s|%s' % (edge['form'], edge['postag'],
                                         edge['head'], edge['deprel'])
                        for edge in sentence)

    ordering_map = {}
    sentences_hash = defaultdict(list)
    with random_utils.smart_open(original_conllfile) as infile:
        conll_utils.FIELDS = conll_utils.CONLL09_COLUMNS
        for idx, sentence in enumerate(
                conll_utils.sentences_from_conll(infile)):
            sentences_hash[signature(sentence)].append(idx)
    # Diagnostic: number of distinct signatures and the largest index list.
    print >> sys.stderr, len(sentences_hash), max(sentences_hash.values())
    #assert(len(sentences_hash) == max(sentences_hash.values())+1);
    with random_utils.smart_open(shuffled_conllfile) as infile2:
        for idx, sentence in enumerate(
                conll_utils.sentences_from_conll(infile2)):
            tok_sent = signature(sentence)
            if tok_sent not in sentences_hash:
                # something is terribly wrong;
                sys.exit(1)
            for orig_idx in sentences_hash[tok_sent]:
                ordering_map[orig_idx] = sentence
    conll_utils.sentences_to_conll09(
        sys.stdout,
        (ordering_map[idx] for idx in sorted(ordering_map.keys())))
    return
def main():
    """Entry point: rewrite POS tags in a CoNLL file via a mapping file.

    Usage: ./prog <map-file> [<input-conll> [<output-conll>]].  Empty file
    names are passed to random_utils.smart_open — presumably falling back
    to the standard streams; verify against smart_open.  The mapping is
    stored in the module-level `map_tag` used by `convert_tags`.
    """
    if len(sysargv) < 2:
        print >> stderr, "./%s <map-file>" % (sysargv[0])
        return
    global map_tag
    map_tag = read_mapping(sysargv[1])
    inputFileName = sysargv[2] if len(sysargv) >= 3 else ''
    outputFileName = sysargv[3] if len(sysargv) >= 4 else ''
    with random_utils.smart_open(inputFileName) as infile, \
            random_utils.smart_open(outputFileName, 'wb') as outfile:
        # BUGFIX: the original passed the undefined names `outputFile` and
        # `inputFile` here (NameError at runtime); the context-manager
        # handles are `outfile`/`infile`.
        conll_utils.sentences_to_conll07(
            outfile,
            imap(convert_tags, conll_utils.sentences_from_conll(infile)))
    return
def strip_everything_except_pos():
    """Reduce CoNLL edges to id/form/POS/head/deprel and emit CoNLL09.

    Reads sys.argv[1] (comments passed through to the sentence reader) and
    writes sys.argv[2].  Both cpostag and postag in the output are filled
    from the input's cpostag column.
    """
    keep = lambda edge: dict([('id', edge['id']),
                              ('form', edge['form']),
                              ('cpostag', edge['cpostag']),
                              ('postag', edge['cpostag']),
                              ('head', edge['head']),
                              ('deprel', edge['deprel'])])
    # NOTE: for the Hindi dataset both POS columns were taken from
    # 'postag' instead of 'cpostag'.
    with random_utils.smart_open(sys.argv[1]) as infile, \
            random_utils.smart_open(sys.argv[2], 'w') as outfile:
        stripped = []
        for sentence in conll_utils.sentences_from_conll(infile,
                                                         comments=True):
            stripped.append([keep(edge) for edge in sentence])
        conll_utils.sentences_to_conll09(outfile, stripped)
    return
def jackknifing():
    """Split a CoNLL09 file into k folds for jackknifed training.

    sys.argv: <conllfile> <outdir> <folds>.  For each fold i (1-based,
    zero-padded), writes '<name>-splitI<ext>' holding fold i and
    '<name>-restI<ext>' holding all remaining folds into <outdir>.
    Sentences are assigned to folds round-robin by index.
    """
    conllfile = sys.argv[1]
    outdir = sys.argv[2]
    folds = int(sys.argv[3])
    import os
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    with random_utils.smart_open(conllfile) as infile:
        conll_utils.FIELDS = conll_utils.CONLL09_COLUMNS
        sentences_hash = dict(
            enumerate(conll_utils.sentences_from_conll(infile)))
        # Round-robin assignment: sentence index -> fold number.
        fold_splits = defaultdict(list)
        for sent_idx in xrange(len(sentences_hash)):
            fold_splits[sent_idx % folds].append(sent_idx)
        foldlen = len(str(folds))
        filename, fileext = os.path.splitext(os.path.split(conllfile)[1])
        for fold_idx in fold_splits:
            tag = str(fold_idx + 1).zfill(foldlen)
            splitpath = os.path.join(
                outdir, '%s-split%s%s' % (filename, tag, fileext))
            restpath = os.path.join(
                outdir, '%s-rest%s%s' % (filename, tag, fileext))
            with random_utils.smart_open(splitpath, 'w') as outfile:
                conll_utils.sentences_to_conll09(
                    outfile,
                    (sentences_hash[i]
                     for i in sorted(fold_splits[fold_idx])))
            with random_utils.smart_open(restpath, 'w') as outfile:
                rest_ids = chain.from_iterable(
                    fold_splits[i] for i in fold_splits if i != fold_idx)
                conll_utils.sentences_to_conll09(
                    outfile,
                    (sentences_hash[i] for i in sorted(rest_ids)))
    return