Example #1
from random import Random

def line2features(line, args, vocab, pairs_file, sub, subsampler):
    win = int(args['--win'])
    ngram_word = int(args['--ngram_word'])
    ngram_context = int(args['--ngram_context'])
    overlap = args['--overlap']
    rnd = Random(17)
    tokens = line.strip().split()
    for i in range(len(tokens)): #loop over each position in the line
        for gram_word in range(1, ngram_word + 1): #loop over n-gram orders for the (center) word
            word = getNgram(tokens, i, gram_word)
            word = check_word(word, vocab, sub, subsampler, rnd)
            if word is None:
                continue
            for gram_context in range(1, ngram_context + 1): #loop over n-gram orders for the context
                start = i - win + gram_word - 1
                end = i + win - gram_context + 1
                for j in range(start, end + 1):
                    if overlap:
                        if i == j and gram_word == gram_context:
                            continue
                    else:
                        if len(set(range(i, i + gram_word)) & set(range(j, j + gram_context))) > 0: #skip overlapping word/context spans
                            continue
                    context = getNgram(tokens, j, gram_context)
                    context = check_word(context, vocab, sub, subsampler, rnd)
                    if context is None:
                        continue
                    pairs_file.write(word + ' ' + context + "\n") #write the pair to the file
Example #2
from random import Random

def word_word(line, args, vocab, pairs_file, sub, subsampler): #identical to the word2vec toolkit; dynamic and dirty window
    rnd = Random(17)
    win = int(args['--win'])
    win = rnd.randint(1, win) #dynamic window: sample the effective window size
    tokens = [t if t in vocab else None for t in line.strip().split()]
    if sub:
        tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None for t in tokens]
    tokens = [t for t in tokens if t is not None] #dirty window: drop removed tokens before windowing
    for i in range(len(tokens)): #loop over each position in the line
        word = getNgram(tokens, i, 1)
        if word is None:
            continue
        start = i - win
        end = i + win
        for j in range(start, end + 1):
            if i == j:
                continue
            context = getNgram(tokens, j, 1)
            if context is None:
                continue
            pairs_file.write(word + ' ' + context + "\n")
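The "dirty window" above removes out-of-vocabulary and subsampled tokens before the window is applied, so surviving words can land closer together than in the raw text. A standalone illustration of just that filtering step, with a toy vocabulary and drop probability (not part of the toolkit):

from random import Random

rnd = Random(17)
vocab = {'the': 3, 'cat': 1, 'sat': 1}
subsampler = {'the': 0.9} #toy drop probability for the frequent word 'the'
tokens = [t if t in vocab else None for t in 'the big cat sat'.split()]
tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None for t in tokens]
tokens = [t for t in tokens if t is not None]
print tokens #'big' is gone (out of vocabulary) and 'the' is likely dropped by subsampling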
Example #3
from random import Random

def word_character(line, args, vocab, pairs_file, sub, subsampler): #identical to the word2vec toolkit; dynamic and dirty window
    rnd = Random(17)
    char_range = (int(args['--ngram_char_low']), int(args['--ngram_char_up'])) #character n-gram order range
    win = int(args['--win'])
    dynamic = args['--dynamic_win']
    if dynamic:
        win = rnd.randint(1, win) #dynamic window
    tokens = [t if t in vocab else None for t in line.strip().split()]
    if sub:
        tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None for t in tokens]
    for i in range(len(tokens)): #loop over each position in the line
        word = getNgram(tokens, i, 1)
        if word is None:
            continue
        start = i - win
        end = i + win
        for j in range(start, end + 1):
            context = getNgram(tokens, j, 1)
            if context is None:
                continue
            if i == j: #at the center position, pair the word with its own character n-grams
                characters = list(context.decode('utf-8'))
                for char_ngram in range(char_range[0], char_range[1] + 1):
                    for char_start in range(len(characters)):
                        char_end = char_start + char_ngram
                        if char_end > len(characters):
                            break
                        pairs_file.write(word + ' ' + ''.join([char.encode('utf-8') for char in characters[char_start: char_end]]) + "\n")
                continue
            pairs_file.write(word + ' ' + context + "\n")
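When i == j, the function above pairs the center word with its own character n-grams instead of skipping the position. The character-level extraction in isolation (Python 2 byte-string handling, matching the snippet; the sample word is illustrative):

word = 'cats'
char_range = (1, 2) #same meaning as --ngram_char_low / --ngram_char_up
characters = list(word.decode('utf-8'))
for char_ngram in range(char_range[0], char_range[1] + 1):
    for char_start in range(len(characters)):
        char_end = char_start + char_ngram
        if char_end > len(characters):
            break
        print ''.join([c.encode('utf-8') for c in characters[char_start:char_end]])
#prints c, a, t, s, ca, at, ts (one per line)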
Example #4
from random import Random

def word_word(line, args, vocab, pairs_file, sub, subsampler):
    win = int(args['--win'])
    rnd = Random(17)
    tokens = line.strip().split()
    for i in range(len(tokens)): #loop over each position in the line
        word = getNgram(tokens, i, 1)
        word = check_word(word, vocab, sub, subsampler, rnd)
        if word is None:
            continue
        start = i - win
        end = i + win
        for j in range(start, end + 1):
            if i == j:
                continue
            context = getNgram(tokens, j, 1)
            context = check_word(context, vocab, sub, subsampler, rnd)
            if context is None:
                continue
            pairs_file.write(word + ' ' + context + "\n")
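Unlike the dirty window in Example #2, this variant leaves out-of-vocabulary tokens in place, so they still occupy window slots. A usage sketch under the same assumptions as before (module helpers getNgram/check_word, toy vocabulary):

args = {'--win': '1'}
vocab = {'cat': 1, 'sat': 1}
pairs_file = open('pairs_clean.txt', 'w')
#'big' is out of vocabulary but still takes up a position, so with win=1
#'cat' and 'sat' are never paired with each other here.
word_word('cat big sat', args, vocab, pairs_file, False, None)
pairs_file.close()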
Example #5
from random import Random

def word_text(line, args, vocab, pairs_file, sub, subsampler, text_id):
    rnd = Random(17)
    tokens = line.strip().split()
    if len(tokens) < 200: #skip short texts
        return
    for i in range(len(tokens)): #loop over each position in the line
        word = getNgram(tokens, i, 1)
        word = check_word(word, vocab, sub, subsampler, rnd)
        if word is None:
            continue
        pairs_file.write('#' + str(text_id) + ' ' + word + "\n")
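A usage sketch for word_text under the same assumptions; note that the function silently skips lines shorter than 200 tokens, so the toy line below is repeated to pass that threshold. The args parameter is unused in the body, so None is passed here:

vocab = {'cat': 1, 'sat': 1}
line = ' '.join(['cat', 'sat'] * 100) #200 tokens, enough to pass the length check
pairs_file = open('pairs_text.txt', 'w')
word_text(line, None, vocab, pairs_file, False, None, 42)
pairs_file.close()
#each surviving token is written as '#42 <word>', pairing text id 42 with its words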
Example #6
from random import Random

def word_wordLR(line, args, vocab, pairs_file, sub, subsampler):
    win = int(args['--win'])
    rnd = Random(17)
    tokens = line.strip().split()
    for i in range(len(tokens)): #loop over each position in the line
        word = getNgram(tokens, i, 1)
        word = check_word(word, vocab, sub, subsampler, rnd)
        if word is None:
            continue
        start = i - win
        end = i + win
        for j in range(start, end + 1):
            if i == j:
                continue
            context = getNgram(tokens, j, 1)
            context = check_word(context, vocab, sub, subsampler, rnd)
            if context is None:
                continue
            if j < i: #tag contexts by their side of the center word
                pairs_file.write(word + ' ' + context + '#L' + "\n")
            else:
                pairs_file.write(word + ' ' + context + '#R' + "\n")
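This variant encodes direction by suffixing '#L' or '#R' to the context token; check_word runs on the plain context before the suffix is appended, so the vocabulary needs no tagged entries. Under the same toy assumptions, a two-word line would yield one pair in each direction:

args = {'--win': '1'}
vocab = {'cat': 1, 'sat': 1}
pairs_file = open('pairs_lr.txt', 'w')
word_wordLR('cat sat', args, vocab, pairs_file, False, None)
pairs_file.close()
#pairs_lr.txt should now contain:
#  cat sat#R
#  sat cat#L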
Example #7
from random import Random
from math import sqrt

def c2p(args, tid):
    pairs_file = open(args['<pairs>'] + "_" + str(tid), 'w')
    win = int(args['--win'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    ngram_word = int(args['--ngram_word'])
    ngram_context = int(args['--ngram_context'])
    overlap = args['--overlap']
    threads_num = int(args['--threads_num'])

    vocab = load_count_vocabulary(args['<vocab>']) #load the vocabulary (generated in the corpus2vocab stage)
    train_uni_num = 0 #number of (unigram) tokens in the corpus
    for w, c in vocab.iteritems():
        if '@$' not in w: #'@$' joins the parts of higher-order n-grams
            train_uni_num += c
    train_num = sum(vocab.values()) #number of (n-gram) tokens in the corpus
    if tid == 0:
        print 'vocabulary size: ' + str(len(vocab))
        print 'number of training words (uni-grams): ' + str(train_uni_num)
        print 'number of training n-grams: ' + str(train_num)
    subsample *= train_uni_num
    subsampler = None #avoid a NameError below when subsampling is disabled
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count)) for word, count in vocab.iteritems() if count > subsample]) #drop probability for each frequent word

    rnd = Random(17)
    with open(args['<corpus>']) as f:
        line_num = 0
        if tid == 0:
            print str(line_num / 1000) + "K lines processed."
        for line in f:
            line_num += 1
            if line_num % 1000 == 0 and tid == 0:
                print "\x1b[1A" + str(line_num / 1000) + "K lines processed."
            if line_num % threads_num != tid: #each thread handles an interleaved subset of lines
                continue
            tokens = line.strip().split()
            for i in xrange(len(tokens)): #loop over each position in the line
                for gram_word in xrange(1, ngram_word + 1): #loop over n-gram orders for the (center) word
                    word = getNgram(tokens, i, gram_word)
                    word = check_word(word, vocab, sub, subsampler, rnd)
                    if word is None:
                        continue
                    for gram_context in xrange(1, ngram_context + 1): #loop over n-gram orders for the context
                        start = i - win + gram_word - 1
                        end = i + win - gram_context + 1
                        for j in xrange(start, end + 1):
                            if overlap:
                                if i == j and gram_word == gram_context:
                                    continue
                            else:
                                if len(set(range(i, i + gram_word)) & set(range(j, j + gram_context))) > 0: #skip overlapping word/context spans
                                    continue
                            context = getNgram(tokens, j, gram_context)
                            context = check_word(context, vocab, sub, subsampler, rnd)
                            if context is None:
                                continue
                            pairs_file.write(word + ' ' + context + "\n") #write the pair to the file
    pairs_file.close()
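The subsampler built in c2p follows the word2vec subsampling heuristic: a word with count c is assigned drop probability 1 - sqrt(t * N / c), where t is the --sub threshold and N the unigram token count. A standalone sketch of that computation, read directly from the code above (toy counts):

from math import sqrt

t = 1e-4 #the --sub threshold
vocab = {'the': 900000, 'cat': 50, 'sat': 40}
N = sum(vocab.values()) #every entry here is a unigram
threshold = t * N
subsampler = dict([(w, 1 - sqrt(threshold / c)) for w, c in vocab.items() if c > threshold])
print subsampler #{'the': ~0.99}; 'cat' and 'sat' fall below the threshold and are always kept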