Пример #1
0
def augment(inputs, outputs, tags, hallucinator, min_len=3, max_len=10):
    """Build augmented examples by overwriting aligned spans with sampled text.

    Every input/output item is joined into a string, and the resulting pairs
    are aligned with ``align.Aligner``. For each aligned pair for which
    ``find_good_range`` reports at least one ``(start, end)`` span, the span
    is replaced on both the source and the target side by
    ``hallucinator.sample(<current source span>)``. Pairs for which no span
    is reported are left out of the result.

    Args:
        inputs: Iterable of sequences of strings (each is ``''.join``-ed).
        outputs: Iterable parallel to ``inputs``.
        tags: Indexable; ``tags[k]`` is copied for the k-th aligned pair.
            NOTE(review): assumes the aligner keeps the input order.
        hallucinator: Object exposing ``sample(seq)``.
        min_len: Forwarded to ``find_good_range``.
        max_len: Forwarded to ``find_good_range``.

    Returns:
        A tuple ``(new_inputs, new_outputs, new_tags)`` of three lists; the
        first two hold lists of characters, one per augmented pair.
    """
    joined_pairs = []
    for inp, out in zip(inputs, outputs):
        joined_pairs.append((''.join(inp), ''.join(out)))
    # The aligner pads null alignments with spaces.
    aligned = align.Aligner(joined_pairs).alignedpairs

    new_inputs, new_outputs, new_tags = [], [], []
    for idx, (src, trg) in enumerate(aligned):
        spans = find_good_range(src, trg, min_len, max_len)
        if not spans:
            continue

        src_chars = list(src)
        trg_chars = list(trg)
        for span in spans:
            start, end = span
            replacement = hallucinator.sample(src_chars[start: end])
            # Source side first, then target, both from the same sample.
            src_chars[start: end] = replacement
            trg_chars[start: end] = replacement

        def keep(pos, ch):
            # Keep visible characters; whitespace survives only when both
            # sides carry a space at this position.
            return ch.strip() or (src_chars[pos] == trg_chars[pos] == ' ')

        trimmed_src = [ch for pos, ch in enumerate(src_chars) if keep(pos, ch)]
        trimmed_trg = [ch for pos, ch in enumerate(trg_chars) if keep(pos, ch)]

        new_inputs.append(trimmed_src)
        new_outputs.append(trimmed_trg)
        new_tags.append(tags[idx])

    return new_inputs, new_outputs, new_tags
def smart_align(pairs, align_symbol=ALIGN_SYMBOL,
                iterations=150, burnin=5, lag=1, mode='crp', **kwargs):
    """Align ``pairs`` with ``align.Aligner`` and return its aligned pairs.

    Args:
        pairs: Passed to ``align.Aligner`` as its positional argument.
        align_symbol: Forwarded as ``align_symbol``.
        iterations: Forwarded as ``iterations``.
        burnin: Forwarded as ``burnin``.
        lag: Forwarded as ``lag``.
        mode: Forwarded as ``mode``.
        **kwargs: Accepted but not used by this function.

    Returns:
        The ``alignedpairs`` attribute of the constructed aligner.
    """
    aligner = align.Aligner(
        pairs,
        align_symbol=align_symbol,
        iterations=iterations,
        burnin=burnin,
        lag=lag,
        mode=mode,
    )
    return aligner.alignedpairs
Пример #3
0
def augment(inputs, outputs, tags, characters):
    """Create noisy copies of aligned pairs by random character substitution.

    Every input/output item is joined into a string and the pairs are aligned
    with ``align.Aligner`` using a space as the alignment symbol. For each
    span reported by ``find_good_range``, every position is, with probability
    0.5, overwritten on both sides with the same character drawn from
    ``characters`` (spaces excluded). Spans longer than five positions are
    shrunk by one position at each end before substitution.

    Args:
        inputs: Iterable of sequences of strings (each is ``''.join``-ed).
        outputs: Iterable parallel to ``inputs``; pairs are formed with
            ``zip``, so surplus items on the longer side are ignored.
        tags: Indexable; ``tags[k]`` is copied for the k-th aligned pair.
        characters: Iterable of candidate replacement characters.

    Returns:
        A tuple ``(new_inputs, new_outputs, new_tags)`` of three lists with
        one entry per aligned pair. Pairs for which ``find_good_range``
        returns nothing contribute an empty list to each of the three, so
        positions stay parallel to the aligned pairs.
    """
    joined = [(''.join(inp), ''.join(out))
              for inp, out in zip(inputs, outputs)]
    aligned = align.Aligner(joined, align_symbol=' ').alignedpairs

    # The space is the alignment symbol, so it must never be drawn as a
    # replacement. Filtering removes every occurrence and needs no
    # exception handling when no space is present.
    vocab = [c for c in characters if c != u" "]

    new_inputs = []
    new_outputs = []
    new_tags = []
    for k, item in enumerate(aligned):
        src, trg = item[0], item[1]
        good_range = find_good_range(src, trg)
        if not good_range:
            # Placeholders keep the three result lists parallel.
            new_inputs.append([])
            new_outputs.append([])
            new_tags.append([])
            continue

        new_i, new_o = list(src), list(trg)
        for r in good_range:
            s = r[0]
            e = r[1]
            if e - s > 5:  # arbitrary value
                # Leave the first and last position of long spans intact.
                s += 1
                e -= 1
            for j in range(s, e):
                if random() > 0.5:  # arbitrary value
                    nc = choice(vocab)
                    new_i[j] = nc
                    new_o[j] = nc

        # Drop alignment padding: keep visible characters, and keep a space
        # only when both sides have a space at that position.
        new_i1 = [
            c for pos, c in enumerate(new_i)
            if (c.strip() or (new_o[pos] == ' ' and new_i[pos] == ' '))
        ]
        new_o1 = [
            c for pos, c in enumerate(new_o)
            if (c.strip() or (new_i[pos] == ' ' and new_o[pos] == ' '))
        ]
        new_inputs.append(new_i1)
        new_outputs.append(new_o1)
        new_tags.append(tags[k])

    return new_inputs, new_outputs, new_tags
Пример #4
0
def process(organism_ids, align_method, similarity_mode,
            power_alpha=cs.ALPHA_BIAS, check=True, visual=False):
    """Build a network, align it, and optionally emit visualisations.

    Args:
        organism_ids: Forwarded to ``initialize_network``.
        align_method: Forwarded to ``initialize_network`` and used to
            construct ``align.Aligner``.
        similarity_mode: Forwarded to ``initialize_network``.
        power_alpha: Forwarded to ``initialize_network``.
        check: Forwarded to the aligner's ``align`` call.
        visual: When truthy, the ``visualize.gephi_*`` helpers are called
            for both organisms of the network and for the alignment.

    Returns:
        The value returned by ``aligner.align``.
    """
    # The network is built before the aligner is constructed.
    network = initialize_network(organism_ids, align_method,
                                 similarity_mode, power_alpha)

    result = align.Aligner(align_method).align(network, check=check)

    if not visual:
        return result

    # Per-organism views first, then the two aligned-network views.
    visualize.gephi_organism_ppi(network.org1)
    visualize.gephi_organism_ppi(network.org2)
    visualize.gephi_network_aligned(result, network)
    visualize.gephi_network_aligned_comp(result, network)
    return result
Пример #5
0
def med_align(wordpairs, align_symbol):
    """Return the aligned pairs from ``align.Aligner`` run with ``mode='med'``.

    Args:
        wordpairs: Passed to ``align.Aligner`` as its positional argument.
        align_symbol: Forwarded as ``align_symbol``.
    """
    return align.Aligner(
        wordpairs,
        align_symbol=align_symbol,
        mode='med',
    ).alignedpairs
Пример #6
0
def mcmc_align(wordpairs, align_symbol):
    """Return the aligned pairs from ``align.Aligner`` with its default mode.

    Args:
        wordpairs: Passed to ``align.Aligner`` as its positional argument.
        align_symbol: Forwarded as ``align_symbol``.
    """
    return align.Aligner(wordpairs, align_symbol=align_symbol).alignedpairs
Пример #7
0
    citationforms = {
        c
        for c in citationforms if citationforms[c] > 4 and citationforms[c] /
        float(citationforms[c] + negcitationforms[c]) >= 0.95
    }

    for l in lines2:
        msd1, form1, msd2, form2 = l.split(u'\t')
        if msd1 in citationforms and msd1 != msd2:
            traindata1.append((form1, form2, msd2))
        if msd2 in citationforms and msd1 != msd2:
            traindata1.append((form2, form1, msd1))

# Extend every training item with character-aligned versions of its two
# word forms.
if task == 1 or not constrained:
    # Only the first two fields (the word forms) are aligned.
    wordpairs = [(x[0], x[1]) for x in traindata1]
    # NOTE(review): u'_' is presumably the padding symbol for unaligned
    # positions -- confirm against align.Aligner.
    a = align.Aligner(wordpairs, align_symbol=u'_', iterations=30)
    # Each item becomes (field0, field1, field2, aligned0, aligned1);
    # a.alignedpairs is indexed in the same order as traindata1.
    traindata1 = [(traindata1[i][0], traindata1[i][1], traindata1[i][2],
                   a.alignedpairs[i][0], a.alignedpairs[i][1])
                  for i in range(len(traindata1))]

# NOTE(review): C and V look like consonant / vowel inventories derived from
# `words` -- confirm against consvowOCP.candv.
C, V = consvowOCP.candv(words)

# Lemma > form
if task == 1 or not constrained:
    for lemma, form, msd, lemmaaligned, formaligned in traindata1:
        # Make sure both per-msd tables have a list for this msd.
        if msd not in fromlemma:
            fromlemma[msd] = []
        if msd not in tolemma:
            tolemma[msd] = []

        # Aligned (lemma, form) pair for this training item.
        alignedpair1 = (lemmaaligned, formaligned)
Пример #8
0
def mcmc_align(wordpairs, align_symbol):
    """Return the aligned pairs from ``align.Aligner`` with ``random_seed=42``.

    Args:
        wordpairs: Passed to ``align.Aligner`` as its positional argument.
        align_symbol: Forwarded as ``align_symbol``.
    """
    return align.Aligner(
        wordpairs,
        align_symbol=align_symbol,
        random_seed=42,
    ).alignedpairs