Exemplo n.º 1
0
def main(word_file, pos_file, ngram_file, ngram_word_file, ngram_ai=None):
    """Convert n-gram lines read from stdin into ngram / ngram_word rows.

    Each stdin line must split on TAB into exactly two fields: a
    whitespace-separated token string and a frequency.  For every line the
    id counter is advanced, one summary row is written to ``ngram_file``
    and one row per token is written to ``ngram_word_file``.

    Args:
        word_file: Passed through to ``word_id`` for every token.
        pos_file: Passed to ``init_pos_key`` to build the POS lookup, then
            closed by this function.
        ngram_file: Receives ``id TAB token-count TAB freq`` rows.
        ngram_word_file: Receives ``id TAB position TAB word-id TAB pos-id``
            rows; pos-id is a backslash-N marker when no POS tag resolved.
        ngram_ai: Initial value of the id counter.  When None, the counter
            starts from ``ngram.max_id('ngram')``.  The counter is
            incremented before each n-gram is written, so the first row
            uses the initial value plus one.

    Lines that fail are reported on stderr and echoed to stdout; processing
    then continues with the next line.
    """
    if ngram_ai is None:
        ngram_id = ngram.max_id('ngram')
    else:
        ngram_id = ngram_ai
    pos_key = init_pos_key(pos_file)
    pos_file.close()
    for line in sys.stdin:
        try:
            # Unpacking raises ValueError (handled below) unless the stripped
            # line has exactly two TAB-separated fields.
            words, freq = line.rstrip().split('\t')
            ngram_id += 1
            words = words.split()
            print('%d\t%d\t%s' % (ngram_id, len(words), freq),
                  file=ngram_file)

            for i, word in enumerate(words):
                try:
                    # Treat the text after the last underscore as the POS tag.
                    w, pos = word.rsplit('_', 1)
                    pid = pos_key[pos]
                    wid = word_id(w, word_file)
                except Exception:
                    # No POS tag or invalid POS tag: store the whole token
                    # as the word.  `except Exception` (rather than a bare
                    # `except`) lets KeyboardInterrupt / SystemExit stop the
                    # import instead of being mistaken for a missing tag.
                    wid = word_id(word, word_file)
                    # presumably the bulk loader's NULL marker -- TODO confirm
                    pid = '\\N'
                print('%d\t%d\t%d\t%s' % (ngram_id, i, wid, pid),
                      file=ngram_word_file)

            if ngram_id % PROGRESS == 0:
                print(ngram_id, file=sys.stderr)
        except Exception as e:
            # Best effort: report the error, echo the offending line to
            # stdout, and move on to the next line.
            print(e, file=sys.stderr)
            print(line)
def main(word_file, pos_file, ngram_file, ngram_word_file, ngram_ai=None):
    """Convert n-gram lines read from stdin into ngram / ngram_word rows.

    NOTE(review): this variant is written in Python 2 syntax (``print >>f``
    statements and ``except Exception, e``); it does not parse under
    Python 3.

    Each stdin line is split on TAB into a token string and a frequency.
    The token string is split on whitespace; one summary row goes to
    ``ngram_file`` and one row per token goes to ``ngram_word_file``.

    Args:
        word_file: Passed through to ``word_id`` for every token.
        pos_file: Passed to ``init_pos_key`` to build the POS lookup, then
            closed by this function.
        ngram_file: Receives ``id TAB token-count TAB freq`` rows.
        ngram_word_file: Receives ``id TAB position TAB word-id TAB pos-id``
            rows.
        ngram_ai: Initial value of the id counter; when None the counter
            starts from ``ngram.max_id('ngram')``.  The counter is
            incremented before each n-gram is written.
    """
    # Seed the id counter either from the caller or from ngram.max_id.
    if ngram_ai is None:
        ngram_id = ngram.max_id('ngram')
    else: ngram_id = ngram_ai
    pos_key = init_pos_key(pos_file)
    pos_file.close()
    for line in sys.stdin:
        try:
            # Unpacking fails (and is handled by the outer except) unless the
            # stripped line has exactly two TAB-separated fields.
            words, freq = line.rstrip().split('\t')
            ngram_id += 1
            words = words.split()
            # Summary row: id, number of tokens, frequency (kept as text).
            print >>ngram_file, '%d\t%d\t%s' \
                % (ngram_id, len(words), freq)
                
            for i, word in enumerate(words):
                try:
                    # Text after the last underscore is looked up as a POS tag.
                    w, pos = word.rsplit('_', 1)
                    pid = pos_key[pos]
                    wid = word_id(w, word_file)
                # NOTE(review): the bare except also catches KeyboardInterrupt
                # and SystemExit, and any failure raised by word_id(w, ...).
                except: # no POS tag or invalid POS tag
                    # Fall back to storing the whole token as the word.
                    wid = word_id(word, word_file)
                    # presumably the bulk loader's NULL marker -- TODO confirm
                    pid = '\\N'
                # Per-token row: id, position within the n-gram, word id, POS id.
                print >>ngram_word_file, '%d\t%d\t%d\t%s' \
                    % (ngram_id, i, wid, pid)
            
            # Progress report on stderr every PROGRESS ids.
            if ngram_id % PROGRESS == 0:
                print >>sys.stderr, ngram_id
        except Exception, e:
            # Report the error on stderr, echo the offending line to stdout,
            # and continue with the next line.
            print >>sys.stderr, e
            print line
Exemplo n.º 3
0
def main(word_file, pos_file, dep_file, arc_file, arc_word_file, n):
    """Convert syntactic-arc lines read from stdin into arc / arc_word rows.

    Each stdin line is split on TAB into at most four fields.  Field 1 holds
    the whitespace-separated tokens and field 2 the frequency; fields 0 and
    3 are not used here.  Every token is decomposed by ``ngram.parse_word``
    into word, POS, dependency label and head index.

    Rows for a line are buffered and written only after the whole line has
    been parsed.  The id counter advances only on success, so writing rows
    eagerly would leave partial rows behind on a failure and then reuse the
    same id for the next line.

    Args:
        word_file: Passed through to ``word_id``.
        pos_file: Passed through to ``pos_id``.
        dep_file: Passed through to ``dep_id``.
        arc_file: Receives ``id TAB n TAB freq`` rows.
        arc_word_file: Receives ``id TAB position TAB word-id TAB pos-id
            TAB dep-id TAB head-index`` rows.
        n: Arc size; written into every arc row and into the start-up
            message.

    Lines that fail are reported on stderr and echoed to stdout; processing
    then continues with the next line.
    """
    print("Processing %d-arcs" % n, file=sys.stderr)

    arc_id = ngram.max_id('arc') + 1
    for line in sys.stdin:
        try:
            entry = line.strip().split('\t', 3)
            # IndexError here (fewer than three fields) rejects the line
            # before anything is written.
            arc_row = '%d\t%d\t%s' % (arc_id, n, entry[2])

            word_rows = []
            for i, token in enumerate(entry[1].split()):
                word, pos, dep, head_index = ngram.parse_word(token)
                wid = word_id(word, word_file)
                pid = pos_id(pos, pos_file)
                did = dep_id(dep, dep_file)
                word_rows.append(
                    '%d\t%d\t%d\t%d\t%d\t%s'
                    % (arc_id, i, wid, pid, did, head_index))

            # The whole line parsed: emit its rows now.
            print(arc_row, file=arc_file)
            for row in word_rows:
                print(row, file=arc_word_file)

            # NOTE: entry[3] is not exported; the per-year export that
            # consumed it is disabled.
        except Exception as e:
            # Best effort: report the error, echo the offending line to
            # stdout, and move on without consuming an id.
            print(e, file=sys.stderr)
            print(line)
        else:
            arc_id += 1
            # Report progress only when the counter actually advanced, so a
            # run of bad lines cannot repeat the same progress message.
            if arc_id % PROGRESS == 0:
                print(arc_id, file=sys.stderr)
def main(word_file, pos_file, dep_file, arc_file, arc_word_file, n):
    """Convert syntactic-arc lines read from stdin into arc / arc_word rows.

    NOTE(review): this variant is written in Python 2 syntax (``print >>f``
    statements and ``except Exception, e``); it does not parse under
    Python 3.

    Each stdin line is split on TAB into at most four fields.  Field 1 holds
    the whitespace-separated tokens and field 2 the frequency; fields 0 and
    3 are not used by the active code.

    Args:
        word_file: Passed through to ``word_id``.
        pos_file: Passed through to ``pos_id``.
        dep_file: Passed through to ``dep_id``.
        arc_file: Receives ``id TAB n TAB freq`` rows.
        arc_word_file: Receives ``id TAB position TAB word-id TAB pos-id
            TAB dep-id TAB head-index`` rows.
        n: Arc size; written into every arc row and into the start-up
            message.
    """
    print >>sys.stderr, "Processing %d-arcs" % n

    # First id to assign is one past the current maximum.
    arc_id = ngram.max_id('arc') + 1
    for line in sys.stdin:
        try:
            entry = line.strip().split('\t', 3)
            freq = entry[2]
            # NOTE(review): this row is written before the tokens are parsed.
            # If a later step raises, the row stays in arc_file and the same
            # arc_id is used again for the next line, because the counter
            # only advances in the else branch below.
            print >>arc_file, '%d\t%d\t%s' % (arc_id, n, freq)

            words = entry[1].split()
            for i, word in enumerate(words):
                # parse_word yields the word, its POS, its dependency label
                # and the head index for this token.
                word, pos, dep, head_index = ngram.parse_word(word)
                wid = word_id(word, word_file)
                pid = pos_id(pos, pos_file)
                did = dep_id(dep, dep_file)
                print >>arc_word_file, '%d\t%d\t%d\t%d\t%d\t%s' \
                    % (arc_id, i, wid, pid, did, head_index)
            
# Disabled: per-year export of entry[3] (arc_freq_file and skip_years are not
# defined in this function).
#            if not skip_years:
#                for field in entry[3].split('\t'):
#                    year, count = field.split(',')
#                    print >>arc_freq_file, '%d\t%s\t%s' % (arc_id, year, count)
        except Exception, e:
            # Report the error on stderr, echo the offending line to stdout,
            # and continue without consuming an id.
            print >>sys.stderr, e
            print line
        else:
            # Only a fully processed line consumes an id.
            arc_id += 1
            
        # Runs after every line, successful or not.
        if arc_id % PROGRESS == 0:
            print >>sys.stderr, arc_id
Exemplo n.º 5
0
def main(word_file, pos_file, ngram_file, ngram_word_file, ngram_freq_file):
    """Aggregate per-record n-gram lines from stdin into ngram / ngram_word rows.

    Input lines are TAB-separated; field 0 is the n-gram text and field 2 an
    integer frequency.  Consecutive lines with the same field 0 are treated
    as one n-gram and their frequencies are summed.  When field 0 changes,
    the previous n-gram's summary row is written and the new n-gram's tokens
    are written immediately.  Lines for the same n-gram must therefore be
    adjacent in the input.

    Args:
        word_file: Passed through to ``word_id``.
        pos_file: Passed through to ``pos_id``.
        ngram_file: Receives ``id TAB token-count TAB total-freq`` rows.
        ngram_word_file: Receives ``id TAB position TAB word-id TAB pos-id``
            rows; pos-id is the text ``NULL`` when no POS tag resolved.
        ngram_freq_file: Accepted for interface compatibility; nothing is
            written to it because the per-record export is disabled.

    Lines that fail are reported on stderr and echoed to stdout; processing
    then continues with the next line.
    """
    ngram_id = ngram.max_id('ngram')
    # The database handles are closed right after the starting id is read.
    ngram.cur.close()
    ngram.db.close()
    cur_ngram = None
    total_freq = 0
    words = []
    for line in sys.stdin:
        try:
            entry = line.rstrip().split('\t')
            if entry[0] != cur_ngram:  # new n-gram, import words
                # Write previous ngram to file
                if cur_ngram is not None:
                    print('%d\t%d\t%d' % (ngram_id, len(words), total_freq),
                          file=ngram_file)

                cur_ngram = entry[0]
                ngram_id += 1
                total_freq = 0
                words = entry[0].split()
                if ngram_id % PROGRESS == 0:
                    print(ngram_id, file=sys.stderr)

                for i, token in enumerate(words):
                    word = token
                    try:
                        # Only tokens with exactly one underscore carry a POS
                        # tag here; anything else fails the unpack and keeps
                        # the raw token as the word.
                        word, pos = token.split('_')
                        pid = pos_id(pos, pos_file)
                    except Exception:
                        # `except Exception` (rather than a bare `except`)
                        # lets KeyboardInterrupt / SystemExit propagate.
                        pid = 'NULL'
                    # `word` is the stripped word when the split succeeded,
                    # otherwise the raw token.
                    wid = word_id(word, word_file)
                    print('%d\t%d\t%d\t%s' % (ngram_id, i, wid, pid),
                          file=ngram_word_file)

            # NOTE: entry[1] (year) and entry[3] (vol) are not exported; only
            # the summed frequency is kept.
            total_freq += int(entry[2])
        except Exception as e:
            # Best effort: report the error, echo the offending line to
            # stdout, and move on to the next line.
            print(e, file=sys.stderr)
            print(line)

    # The last ngram.  Skipped when stdin was empty: there is nothing to
    # flush, and `words` would previously have been unbound here.
    if cur_ngram is not None:
        print('%d\t%d\t%d' % (ngram_id, len(words), total_freq),
              file=ngram_file)
def main(word_file, pos_file, ngram_file, ngram_word_file, ngram_freq_file):
    """Aggregate per-record n-gram lines from stdin into ngram / ngram_word rows.

    NOTE(review): this variant is written in Python 2 syntax (``print >>f``
    statements and ``except Exception, e``); it does not parse under
    Python 3.

    Input lines are TAB-separated; field 0 is the n-gram text and field 2 an
    integer frequency.  Consecutive lines with the same field 0 are treated
    as one n-gram and their frequencies are summed.

    NOTE(review): within the lines visible here the summary row of the final
    n-gram is never written; confirm whether the function continues past
    this point.

    Args:
        word_file: Passed through to ``word_id``.
        pos_file: Passed through to ``pos_id``.
        ngram_file: Receives ``id TAB token-count TAB total-freq`` rows.
        ngram_word_file: Receives ``id TAB position TAB word-id TAB pos-id``
            rows.
        ngram_freq_file: Not written by the active code; its only use is in
            a commented-out statement.
    """
    ngram_id = ngram.max_id('ngram')
    # The database handles are closed right after the starting id is read.
    ngram.cur.close()
    ngram.db.close()
    cur_ngram = None
    total_freq = 0
    for line in sys.stdin:
        try:
            entry = line.rstrip().split('\t')
            if entry[0] != cur_ngram: # new n-gram, import words
                # Write previous ngram to file
                if cur_ngram is not None:
                    print >>ngram_file, '%d\t%d\t%d' \
                        % (ngram_id, len(words), total_freq)
                
                # Start accumulating the new n-gram under a fresh id.
                cur_ngram = entry[0]
                ngram_id += 1
                total_freq = 0
                words = entry[0].split()
                if ngram_id % PROGRESS == 0:
                    print >>sys.stderr, ngram_id
                
                for i, word in enumerate(words):
                    try: 
                        # Unpacking succeeds only when the token contains
                        # exactly one underscore; `word` is then rebound to
                        # the part before it.
                        word, pos = word.split('_')
                        pid = pos_id(pos, pos_file)
                    # NOTE(review): the bare except also catches
                    # KeyboardInterrupt and SystemExit.
                    except: 
                        pid = 'NULL'
                    finally:
                        # Runs on both paths: `word` is the stripped word if
                        # the split succeeded, otherwise the raw token.
                        wid = word_id(word, word_file)
                    # Per-token row: id, position, word id, POS id.
                    print >>ngram_word_file, '%d\t%d\t%d\t%s' \
                        % (ngram_id, i, wid, pid)
                
            # Only the frequency is used; the year/vol export is disabled.
            #year = entry[1]
            freq = int(entry[2])
            #vol = entry[3]
            #print >>ngram_freq_file, '%d\t%s\t%d\t%s' % (ngram_id, year, freq, vol)
            total_freq += freq
        except Exception, e:
            # Report the error on stderr, echo the offending line to stdout,
            # and continue with the next line.
            print >>sys.stderr, e
            print line