Example #1
def parseCMF(fout):
    try:
        with open(RES_DIR + "output.txt", "r") as CMFResults:
            foundMotifs["CMF"] = []
            foundMotifsSeqs["CMF"] = DD(lambda: DD(list))
            alreadyFound = {}
            readmode = 0 # 0=looking for motif,1=reading pos,2=looking for pos
            for line in CMFResults:
                if readmode == 2:
                    readmode = 1
                    continue
                if readmode == 1:
                    if len(line) < 2:
                        readmode = 0
                        continue
                    seqName, mpos = line.split("\t")[:2]
                    seqName = seqName.strip()
                    #pdb.set_trace()
                    foundMotifs["CMF"][-1][1].append(SEQ_LENS[seqName[1:]] - int(mpos))
                    foundMotifsSeqs["CMF"][motif][seqName[1:]].append(int(mpos))
                    fout.write("\t"+str(SEQ_LENS[seqName[1:]] - int(mpos)))
                    readmode = 2
                    continue
                if line[0:7] == "MOTIF:\t" and not line[7:] in alreadyFound:
                    motif = line[7:-1]
                    fout.write("\nCMF\t"+line[7:-1])
                    foundMotifs["CMF"] += [[line[7:-1], []]]
                    #foundMotifsSeqs["CMF"][motif] = DD(list)
                    #alreadyFound[line[7:]] = 0
                elif "Positive Sites:" in line:
                    readmode = 1
    except IOError:
        print "Error opening CMF's results file"
Example #2
 def __init__(self, getIndex=0, viewIndex=-1):
     self.getIndex, self.viewIndex = getIndex, viewIndex #TODO: Add sanity checks?
     self.key2mon = DD(lambda:DD(set)) # Maps a normal key to keys that monitor it.
     self.monkeys = set()              # Set of monitor keys.
     # store and waiters are mutually exclusive, and could be kept in the same place
     self.store = DD(list)
     self.waiters = DD(list)
     self.opCounts = {b'get': 0, b'put': 0, b'view': 0, b'wait': 0}
     self.ac, self.rc = 0, 0
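
The same alias is used here with set-valued leaves: DD(lambda: DD(set)) gives a two-level mapping whose innermost values are sets, while store and waiters are per-key lists. A small sketch of the key2mon idea, assuming DD is collections.defaultdict (the key and monitor names below are hypothetical):

from collections import defaultdict as DD

key2mon = DD(lambda: DD(set))   # normal key -> some sub-key -> set of monitor keys

key2mon["jobs"]["client-1"].add("mon:jobs")
key2mon["jobs"]["client-1"].add("mon:jobs")      # duplicates collapse in the set
print(key2mon["jobs"]["client-1"])               # {'mon:jobs'}
print(key2mon["other"]["client-2"])              # set() -- empty set on first access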
Example #3
async def get_fkeys(company, table_name):
    with await fkey_lock:
        if company not in fkeys:

            src_fkeys = DD(list)
            tgt_fkeys = DD(list)

            src_fkey = NT('src_fkey',
                          'src_col, tgt_tbl, tgt_col, alt_src, alt_tgt, test')
            tgt_fkey = NT('tgt_fkey',
                          'src_tbl, src_col, tgt_col, is_child, test')

            sql = ("SELECT b.table_name, a.col_name, a.fkey "
                   f"FROM {company}.db_columns a, {company}.db_tables b "
                   "WHERE b.row_id = a.table_id "
                   "AND a.deleted_id = 0 "
                   "AND a.fkey IS NOT NULL")

            async with db_session.get_connection() as db_mem_conn:
                conn = db_mem_conn.db
                cur = await conn.exec_sql(sql)

                async for src_tbl, src_col, fkey in cur:
                    tgt_tbl, tgt_col, alt_src, alt_tgt, is_child, cursor = loads(
                        fkey)
                    if isinstance(tgt_tbl, str):  # normal case
                        test = None
                        src_fkeys[src_tbl].append(
                            src_fkey(src_col, tgt_tbl, tgt_col, alt_src,
                                     alt_tgt, test))
                        tgt_fkeys[tgt_tbl].append(
                            tgt_fkey(src_tbl, src_col, tgt_col, is_child,
                                     test))
                    else:
                        col_name, vals_tables = tgt_tbl
                        for val, tgt_tbl in vals_tables:
                            test = (col_name, val)
                            src_fkeys[src_tbl].append(
                                src_fkey(src_col, tgt_tbl, tgt_col, alt_src,
                                         alt_tgt, test))
                            tgt_fkeys[tgt_tbl].append(
                                tgt_fkey(src_tbl, src_col, tgt_col, is_child,
                                         test))

            fkeys[company] = src_fkeys, tgt_fkeys

    comp_fkeys = fkeys[company]
    src_fkeys = comp_fkeys[0][table_name]  # returns [] if not found
    tgt_fkeys = comp_fkeys[1][table_name]  # returns [] if not found
    return src_fkeys, tgt_fkeys
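
src_fkeys and tgt_fkeys are plain DD(list) groupings: each row returned by the query is appended under its table name, and a lookup for an unknown table yields an empty list, as the two comments above note. One subtlety: reading a missing key from a defaultdict also inserts it, which is harmless here because the grouped dicts are only read per table afterwards. A reduced sketch of the grouping step, with hypothetical rows standing in for the database cursor:

from collections import defaultdict as DD
from collections import namedtuple as NT

src_fkey = NT('src_fkey', 'src_col, tgt_tbl, tgt_col')

rows = [                      # hypothetical (src_tbl, src_col, tgt_tbl, tgt_col) rows
    ('ar_trans', 'cust_id', 'ar_customers', 'row_id'),
    ('ar_trans', 'tran_type', 'adm_tran_types', 'row_id'),
]

src_fkeys = DD(list)
for src_tbl, src_col, tgt_tbl, tgt_col in rows:
    src_fkeys[src_tbl].append(src_fkey(src_col, tgt_tbl, tgt_col))

print(len(src_fkeys['ar_trans']))   # 2
print(src_fkeys['missing_table'])   # [] -- and 'missing_table' is now a key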
Example #4
def parseBioProspector(fout):
    # read sequence mappings
    mappings = {}
    mfname = FILTERED_SEQ.split('/')[-1]
    with open("bp_files/bp_" + mfname + ".mappings", "r") as mapIn:
        for line in mapIn:
            line = line.split(">>")
            mappings[line[0]] = line[1].strip()
    with open(RES_DIR + "bp_output.txt", "r") as bpResults:
        foundMotifs["BioProspector"] = []
        foundMotifsSeqs["BioProspector"] = {}
        for line in bpResults:
            if "Motif #" in line:
                motif = line[line.index("(")+1:line.index("/")]
                fout.write("\nBioProspector\t"+motif)
                foundMotifs["BioProspector"] += [[motif, []]]
                foundMotifsSeqs["BioProspector"][motif] = DD(list)
            elif ">seq" in line:
                slen = int(line.split()[2])
                mpos = int(line.split()[-1])
                if line.split()[-2] == 'r':
                    mpos -= len(motif)
                foundMotifs["BioProspector"][-1][1].append(slen - mpos)
                foundMotifsSeqs["BioProspector"][motif][mappings[line.split()[0][1:]]].append(mpos)
                fout.write("\t" + str(slen - mpos))
Example #5
def getSMData(SMDir, targetNode, start, stop):
    #print ("getSMdata %s"%targetNode)
    sm  = SearchIndex(SMDir+'/%s_sm.px'%targetNode, 40, compTimestamps)
    smd = IndexedHostData(SMDir, targetNode)

    usr2d = DD(list)
    pos = sm.find('%020d'%start)
    #print (pos)
    for x in range(pos, sm.len):
        offset = int(sm[x][20:])
        ts, nd = smd.readData (offset, stop)
        if nd is None: break

        #print("nd=" + repr(nd))
        for usrdata in nd[3:]: # username, userdata
            usr2d[usrdata[0]].append([ts] + list(usrdata[1:7]))
        
    lseries, mseries = [], []
    for usrname in sorted(usr2d.keys()):
        l, m = [], []
        for e in usr2d[usrname]:
            ts = e[0]*1000
            l.append([ts, e[4]])
            m.append([ts, e[6]])
        lseries.append({'data': l, 'name': usrname})
        mseries.append({'data': m, 'name': usrname})
    
    #[{'name':username, 'data':[[timestamp, value]...]} ...]
    return lseries, mseries
Example #6
def voteRank(sequences, motifs):
    poll = {}
    for seq in sequences:
        poll[seq] = [0.0] * len(sequences[seq])
    
    # perform poll
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    for i in xrange(pos, pos + len(motif)):
                        try:
                            # instead of weighting all results the same (1), we
                            # could bias based on tool or number of results or something like that
                            poll[sequence][i - 1] += 1
                        except Exception as e:
                            print e
                            print 'It appears a tool has reported finding a motif',\
                                'outside the bounds of a sequence'
                            print 'such as finding a motif of length 10 at position',\
                                '195 in a sequence with length 200'
                            pdb.set_trace()
    # add up votes for each motif
    ress = DD(int)
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                for pos in motifs[tool][motif][seq]:
                    for p in xrange(pos, pos + len(motif)):
                        ress[motif] += poll[best(sequences, seq)][p-1]
    # sort motifs by number of votes
    return sorted(map(lambda a: list(a[::-1]), ress.iteritems()))
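
ress is a DD(int) tally: ress[motif] += ... works for motifs that have not been seen yet because missing keys start at 0 (the snippet itself is Python 2, hence xrange, print statements and iteritems). A Python 3 sketch of the same tally-and-sort idea with made-up vote values:

from collections import defaultdict as DD

ress = DD(int)                       # missing motifs start at 0
votes = [('ACGT', 3.0), ('TTGA', 1.5), ('ACGT', 2.0)]
for motif, v in votes:
    ress[motif] += v

# sort by vote count (mirrors the list(a[::-1]) trick above, which puts the value first)
ranked = sorted([count, motif] for motif, count in ress.items())
print(ranked)                        # [[1.5, 'TTGA'], [5.0, 'ACGT']]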
Example #7
def parseMEME(fout):
    try:
        with open(RES_DIR + "/meme/meme.txt", "r") as memeResults:
            foundMotifs["MEME"] = []
            foundMotifsSeqs["MEME"] = {}
            readMode = 0 # 0 is looking for next motif, 1 is looking for pos's, 2 is reading pos's
            for line in memeResults:
                if readMode == 1:
                    readMode = 2
                    continue
                if readMode == 2:
                    if "----------------------" in line:
                        readMode = 0
                        continue
                    seqName, sPos = line.split()[0:2]
                    seqName = seqName[0:19].strip()
                    foundMotifs["MEME"][-1][1].append(SEQ_LENS[seqName] - int(sPos))
                    foundMotifsSeqs["MEME"][motif][seqName].append(int(sPos))
                    fout.write("\t" + str(SEQ_LENS[seqName] - int(sPos)))
                if "Multilevel" in line:
                    motif = line.strip().replace("Multilevel","").replace(" ","")
                    fout.write("\nMEME\t"+motif)
                    foundMotifs["MEME"] += [[motif, []]]
                    foundMotifsSeqs["MEME"][motif] = DD(list)
                elif "Sequence name" in line and "Start" in line:
                    readMode = 1
    except IOError:
        print "Error opening MEME's results file"
Example #8
def parseXXmotif(fout):
    try:
        foundMotifs['XXmotif'] = []
        foundMotifsSeqs['XXmotif'] = {}
        reportedMotifs = [None]
        lets = 'ACGT'
        baseFname = RES_DIR + '.'.join(
            FILTERED_SEQ.split('/')[-1].split('.')[:-1])
        #baseFname = 'XXmotif/results/hsap_core_promoters_all'
        # parse motifs
        #pdb.set_trace()
        with open(baseFname + '.pwm') as XXmotifResults:
            cm = []
            for i, line in enumerate(XXmotifResults):
                if i % 6 == 0 or i % 6 == 5:
                    if len(cm) == 4:
                        motif = ''.join([
                            lets[p.index(max(p))]
                            for p in zip(cm[0], cm[1], cm[2], cm[3])
                        ])
                        foundMotifs['XXmotif'].append([motif, []])
                        foundMotifsSeqs['XXmotif'][motif] = DD(list)
                    cm = []
                else:
                    cm.append(map(float, line.split()))
        seqs = []
        # parse sequence mappings
        with open(baseFname + '_sequence.txt') as XXmotifSequences:
            for i in xrange(4):
                XXmotifSequences.next()
            for line in XXmotifSequences:
                seqs.append(line.split('\t')[-1].strip())
        # parse instance locations
        with open(baseFname + '_Pvals.txt') as XXmotifPoss:
            for i in xrange(4):
                XXmotifPoss.next()
            mnum = 0
            for line in XXmotifPoss:
                if len(line) < 4:
                    continue
                if line[:6] == 'Motif ':
                    mnum = int(line.split()[1][:-1]) - 1
                else:
                    pos = int(line.split('\t')[4])
                    seq = seqs[int(line.split('\t')[3]) - 1]
                    foundMotifs['XXmotif'][mnum][1].append(pos)
                    foundMotifsSeqs['XXmotif'][foundMotifs['XXmotif'][mnum]
                                               [0]][seq].append(pos)
        for motif, poss in foundMotifs['XXmotif']:
            fout.write('\nXXmotif\t' + motif)
            for pos in poss:
                fout.write('\t' + str(pos))
    except IOError as e:
        print e, e.filename
        print "Error opening XXmotif's result file"
Example #9
def reset_log():
    # TODO: make it such that if key is not in dict, it's init with incoming content
    # this way, we won't have to know in advance what we want to monitor
    return DD(list)
    logs = OD()
    for name in [
            'inner log p(x|z)', 'log p(x|z)', 'log p(x|z) nn', 'commit', 'vq',
            'kl', 'bpd', 'elbo'
    ]:
        logs[name] = []
    return logs
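
Everything after return DD(list) is unreachable: the function now hands back a bare defaultdict(list) so that, per the TODO, any metric name can be appended to without being declared up front, and the OD() block below it (presumably an OrderedDict) is leftover from the earlier version. A minimal sketch of why the defaultdict variant needs no predefined key list (the metric names are illustrative):

from collections import defaultdict as DD

logs = DD(list)            # replaces the explicit mapping of names to empty lists
logs['elbo'].append(-1.23)
logs['bpd'].append(4.56)
logs['some new metric'].append(0.0)   # no need to register the key in advance
print(dict(logs))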
Example #10
def parseDECOD(fout):
    try:
        with open(RES_DIR + "decod_found_motifs.txt", "r") as _decodResults:
            foundMotifs["DECOD"] = []
            foundMotifsSeqs["DECOD"] = {}
            alreadyFound = {}
            decodResults = _decodResults.readlines()
            lineNum = 0
            chars = ["A", "C", "G", "T"]
            for motif_num in xrange(int(config["DECOD"]["-nmotif"])):
                PWM = {}
                # parse PWM
                for line in decodResults[lineNum:]:
                    lineNum+=1
                    if len(line) > 1 and line[0] in chars:
                        PWM[line[0]] = line.strip("ACGT []\n").split()
                        if line[0] == "T":
                            break
                # construct motif from PWM
                motif = ""
                for i in xrange(len(PWM['A'])):
                    col = [PWM[x][i] for x in chars]
                    motif += chars[col.index(max(col))]
                
                foundMotifsSeqs["DECOD"][motif] = DD(list)
                # parse instances
                if not motif in alreadyFound:
                    fout.write("\nDECOD\t"+motif)
                    # find beginning of instances
                    while decodResults[lineNum][0] != ">":
                        lineNum+=1

                    # save instances
                    positions = []
                    while decodResults[lineNum][0] == ">":
                        line = decodResults[lineNum]
                        lineNum+=1
                        if "|revcom" in line:
                            continue
                        seqName = line[1:line.find("\t")]
                        seqName = seqName.strip()
                        pos = int(line.split("\t")[1])
                        foundMotifsSeqs["DECOD"][motif][seqName].append(pos)
                        pos = SEQ_LENS[seqName] - pos
                        positions.append(pos)
                        fout.write("\t" + str(pos))
                    foundMotifs["DECOD"] += [[motif, positions]]
                    alreadyFound[motif] = 0
    except IOError:
        print "Error opening DECOD's results file"
Example #11
def Find_Matches(pattern, Suffa, Words):
    PS = pattern.split()
    #length = len(PS)
    length = len(pattern)

    index = []
    Returner = DD(list)
    ceiling_cost = (0.3 * len(pattern))
    for i in range(0, length):
        for j in range(length, i, -1):
            remain = length - j + 1
            current = pattern[i:j + 1]
            index = FIND_IN_SUFFIX_ARRAY(Suffa, Words, pattern[i:j + 1])
            Matches = [Match]

            if (len(index) > 1):
                for m in index:
                    if m.segment_id != -1:
                        pbfor = i
                        m.leftmin = abs(m.start - pbfor)

                        if m.leftmin == 0 and i > 0:
                            m.leftmin = 1

                        paftr = len(pattern[j + 1:])
                        m.rightmin = abs(m.remain - paftr)
                        #m.rightmin = abs(abs(m.remain) - abs(paftr))
                        if m.rightmin == 0 and remain > 0:
                            m.rightmin = 1

                        min_cost = m.leftmin + m.rightmin

                        m.pstart = i
                        m.pend = j + 1
                        m.leftmax = max(m.start, pbfor)
                        m.rightmax = max(m.remain, paftr)

                        if (min_cost <= ceiling_cost + 1):
                            #index.remove(m)
                            Returner[pattern[i:j + 1]].append(m)
                            break

                #if(len(index) > 1):
                #    print(pattern[i:j+1],len(index)-1,file = filep )#,file = open("test.txt","a"))

                #for v in index:
                #    Returner[pattern[i:j+1]].append(v)
                #break

    return Returner
Example #12
def Filter_N_Gram(Matches):

    Copy = DD(list)
    Last = -1

    for all in Matches.keys():
        LS = Matches[all]
        LASTIN = LS[0].pstart
        if LASTIN != Last:

            for val in LS:
                Copy[all].append(val)

            Last = LASTIN

    return Copy
Example #13

def CreateSuffixArrayDD(text):
    Suffix = DD(list)
    Words = text.split(sep=' ')
    Suff_Arr = []

    count = 0

    for i in Words:
        if i != '':
            Suffix[i[0]].append(count)
            count += 1

    SuffA = []
    for i in sorted(Suffix):
        SuffA.extend(SortAlpha(Suffix[i], Words))

    return (SuffA, Words)
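
CreateSuffixArrayDD uses DD(list) as a simple bucketing structure: word positions are grouped under the word's first letter before each bucket is processed. A toy sketch of that bucketing step (the input text is made up):

from collections import defaultdict as DD

text = 'the quick brown fox the'          # toy input
Suffix = DD(list)                         # first letter -> word positions
for count, word in enumerate(w for w in text.split(' ') if w != ''):
    Suffix[word[0]].append(count)
print({k: Suffix[k] for k in sorted(Suffix)})   # {'b': [2], 'f': [3], 'q': [1], 't': [0, 4]}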
Example #14
def Find_Matches_NO_FILTER(pattern, Suffa, Words):
    PS = pattern.split()
    #length = len(PS)
    length = len(pattern)

    index = []
    Returner = DD(list)
    ceiling_cost = (0.3 * len(pattern))
    for i in range(0, length):
        for j in range(length, i, -1):
            remain = length - j + 1
            current = pattern[i:j + 1]
            index = FIND_IN_SUFFIX_ARRAY_MY_METHOD(Suffa, Words,
                                                   pattern[i:j + 1])
            Matches = [Match]

            if (len(index) > 1):
                for m in index:
                    Returner[pattern[i:j + 1]].append(m)
    return Returner
Example #15
def conll_evaluate(l0_inputs, alphas, conll_eval_path, all_top_antecedent_scores):
    print("Compiling clusters and evaluators for conll suite")
    coref_predictions = [{} for _ in alphas]
    coref_evaluators = [metrics.CorefEvaluator() for _ in alphas]
    subtoken_maps = {}

    with open(l0_inputs, "rb") as f:
        data_dicts = np.load(f, allow_pickle=True).item().get("data_dicts")

    for example_num, data_dict in enumerate(tqdm(data_dicts)):
        example = data_dict["example"]
        subtoken_maps[example["doc_key"]] = example["subtoken_map"]
        top_span_starts = data_dict["top_span_starts"]
        top_span_ends = data_dict["top_span_ends"]
        top_antecedents = data_dict["top_antecedents"]

        for i in range(len(alphas)):
            top_antecedent_scores = all_top_antecedent_scores[example["doc_key"]][i]
            predicted_antecedents = get_predicted_antecedents(top_antecedents, top_antecedent_scores)
            coref_predictions[i][example["doc_key"]] = evaluate_coref(top_span_starts,
                top_span_ends, predicted_antecedents, example["clusters"], coref_evaluators[i])

    summary_dict = DD(list)
    for i in range(len(alphas)):
        print("\n*****************************")
        print("******* alpha = %f *******" % alphas[i])
        summary_dict["alpha"].append(alphas[i])
        conll_results = conll.evaluate_conll(conll_eval_path, coref_predictions[i], subtoken_maps, official_stdout=True)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"].append(average_f1)
        print("Average F1 (conll): {:.2f}%".format(average_f1))

        p,r,f = coref_evaluators[i].get_prf()
        summary_dict["Average F1 (py)"].append(f)
        print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(subtoken_maps.keys())))
        summary_dict["Average precision (py)"].append(p)
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"].append(r)
        print("Average recall (py): {:.2f}%".format(r * 100))

    return summary_dict
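
summary_dict is a DD(list) that collects one value per alpha for every metric name, so each summary_dict[...].append(...) call works without pre-creating the lists. A compact sketch of the same accumulation pattern with fabricated numbers:

from collections import defaultdict as DD

summary_dict = DD(list)
for alpha, f1 in [(0.1, 71.2), (0.5, 72.8)]:     # hypothetical (alpha, F1) pairs
    summary_dict["alpha"].append(alpha)
    summary_dict["Average F1 (conll)"].append(f1)

print(summary_dict["alpha"])                 # [0.1, 0.5]
print(summary_dict["Average F1 (conll)"])    # [71.2, 72.8]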
Example #16
def parseWeeder(fout):
    try:
        with open(RES_DIR + get_filename(FILTERED_SEQ) + ".w2", "r") as _weederResults:
            foundMotifs["Weeder"] = []
            foundMotifsSeqs["Weeder"] = {}
            weederResults = _weederResults.readlines()[6:]
            for line in weederResults:
                if "Matrix" in line:
                    result = line.split()
                    motif = result[2]
                    fout.write("\nweeder\t"+result[2])
                    foundMotifs["Weeder"] += [[result[2], []]]
                    foundMotifsSeqs["Weeder"][motif] = DD(list)
                elif line[0] == ">":
                    seqName, mpos = line.split('\t')[0::4]
                    seqName = seqName.strip()
                    foundMotifs["Weeder"][-1][1].append(SEQ_LENS[seqName[1:]] - int(mpos))
                    foundMotifsSeqs["Weeder"][motif][seqName[1:]].append(int(mpos))
                    fout.write("\t"+str(SEQ_LENS[seqName[1:]] - int(mpos)))
    except IOError:
        print "Error opening Weeder's results file"
Example #17

def Find_All_Matches(Suffa,Words,pattern):
    length = len(pattern)
    Matches = DD(list)
    ceiling_cost = (0.3 * len(pattern))
    for i in range(0,length):
            for j in range(length,i,-1) :
                search = pattern[i:j+1]
                ind = FIND_IN_SUFFIX_ARRAY(Suffa,Words,search)
                start = i
                end = j+1
                remain = len(pattern[j+1:])
                
                for M in ind:
                    if M.segment_id == -1:
                        break
                      
                    #min_cost = MatchFiltering(start,end,remain,MS)
                    M.leftmin = abs(M.start - start)
                    if M.leftmin == 0 and start > 0:
                        M.leftmin = 1
                    
                    M.rightmin = abs(M.remain - remain)
                    
                    if M.rightmin == 0 and remain > 0:
                        M.rightmin = 1
                    
                    min_cost = M.leftmin + M.rightmin
    
                    if min_cost <= ceiling_cost:                    
                        M.leftmax = max(M.start,start)
                        M.rightmax = max(M.remain,remain)
                        M.pstart = start
                        M.pend = end
                        Matches[(pattern[i:j+1])].append(M)



    return Matches
Example #18
def main():
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description='''Script to find out what words BERT\'s
        attention attends to in layers 0, 9, 10, 11.''')
    parser.add_argument('-i', '--src', type=argparse.FileType('r'), metavar='PATH',
        help='''File containing multiple lines of input text, if not specified,
              uses some predefined text.''')
    parser.add_argument('-t', '--top', type=int, default=10, metavar='K',
        help='Find top K words that BERT attends to. Default 10.')
    parser.add_argument('-b', '--batch_size', type=int, default=20, metavar='B',
        help='Specify batch size=B. Default 20.')
    parser.add_argument('-g', '--gpu', action='store_true',
        help='Option to use GPU.')
    parser.add_argument('-a', '--all_layers', action='store_true',
        help='Output the attention of each layer')
    parser.add_argument('-T', '--out_top', type=int, metavar='N',
        help='Output top N words in final output. If -a is set, default value is 100')
    parser.add_argument('-m', '--mask', action='store_true',
        help='Mask attended words and compare predictions with original words. Not functional yet.')
    parser.add_argument('-o', '--out', type=argparse.FileType('w'),
        help='File to write results', required=True)
    args = parser.parse_args()

    top_k = args.top
    batch_size = args.batch_size
    use_gpu = args.gpu
    do_mask = args.mask
    all_layers = args.all_layers
    out_top_k = args.out_top

    print('all_layers', all_layers)

    if args.src is not None:
        src = args.src
    else:
        text = 'burma has put five cities on a security alert after religious unrest involving buddhists and moslems in the northern city of mandalay , an informed source said wednesday.'
        text1 = 'police arrested five anti-nuclear protesters friday after they sought to disrupt loading of a french antarctic research and supply vessel , a spokesman for the protesters said .'
        text2 = 'turkmen president gurbanguly berdymukhammedov will begin a two-day visit to russia , his country \'s main energy partner , on monday for trade talks , the kremlin press office said .'
        text3 = 'israel \'s new government barred yasser arafat from flying to the west bank to meet with former prime minister shimon peres on thursday , a move palestinian officials said violated the israel-plo peace accords .'
        src = [text, text1, text2, text3]

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
    model.eval()

    if do_mask:
        mask_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        mask_model.eval()
        mask_token = tokenizer.mask_token
        mask_id = tokenizer.convert_tokens_to_ids([mask_token])[0]
        ignore_tokens = ['[CLS]', '[SEP]', '.', ',',
        'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
        'said', '#', 'here', '<', 'news', '>', '`',
        'has', 'have', 'will', 'the', 'a', 'is', 'was', 'are',
        'on', 'as',  'after', 'this', 'in', 'with', 'to']
        ignore_ids = tokenizer.convert_tokens_to_ids(ignore_tokens)
        if use_gpu:
            mask_model = mask_model.cuda()

    if use_gpu:
        assert torch.cuda.is_available(), 'GPU unavailable!'
        model = model.cuda()
        print('-- Using GPU --')
    device = torch.device('cuda') if use_gpu else torch.device('cpu')

    total_words_attended = 0
    total_start_time = time.time()

    # if batch_size is None:
    #     count = 0
    #     layers = [0, 9, 10, 11]
    #     for line in src:
    #         if count % 100 == 0:
    #             print('Processed', count, 'lines')
    #         count += 1
    #         line = line.strip()
    #         tokens_tensor = str_to_idx_tensor(tokenizer, line)
    #         if use_gpu:
    #             tokens_tensor = tokens_tensor.cuda()
    #         str_tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist()[0])
    #         outputs = model(tokens_tensor)
    #
    #         cum_attn = []
    #
    #         for l in layers:
    #             layer = outputs[2][l]
    #             summed = layer.sum(dim=2).sum(dim=1).view(-1)
    #             summed = (summed / summed.sum(dim=0))
    #             cum_attn.append(summed)
    #
    #         all_attns = torch.stack(cum_attn).sum(dim=0)
    #         normalized_attn = (all_attns / all_attns.sum(dim=0)).tolist()
    #         sorted_by_attn = sorted(list(zip(normalized_attn, str_tokens)), key=lambda p: p[0], reverse=True)
    #
    #         for p in sorted_by_attn[:top_k]:
    #             attended_word_dict[p[1]] += 1
    #
    #         total_words_attended += len(sorted_by_attn[:top_k])
    #     if args.src is not None:
    #         args.src.close()

    # batch implementation
    if all_layers:
        attended_word_tensor = torch.zeros(12, tokenizer.vocab_size,
            device=device)
    else:
        layers = torch.tensor([0], device=device)
        attended_word_tensor = torch.zeros(tokenizer.vocab_size, device=device)

    batch_iterator = chunked(src, batch_size)
    iter = 0

    for batch in batch_iterator:
        start_time = time.time()
        if iter % 10 == 0:
            print('Processed', iter, 'batches')
        iter += 1

        batch_toks, _, attn_mask, _ = batch_to_idx_tensor(tokenizer, batch)

        if use_gpu:
            batch_toks = batch_toks.cuda()
            attn_mask = attn_mask.cuda()

        with torch.no_grad():
            outputs = model(batch_toks, attention_mask=attn_mask)

        attn = torch.stack(outputs[2])
        # attn has shape [num_layers (12), batch_size, num_heads, max_src_len, max_src_len]
        if all_layers:
            summed = attn.sum(dim=3).sum(dim=2) # [num_layers, batch_size, max_src_len]
            summed = summed / summed.sum(dim=2, keepdim=True) # normalize
            summed.transpose_(0,1)
            # summed has shape [batch_size, num_layers, max_src_len]
        else:
            attn = attn.index_select(0, layers)
            summed = attn.sum(dim=3).sum(dim=2).sum(dim=0).view(attn.shape[1], attn.shape[4])
            summed = summed / summed.sum(dim=1, keepdim=True) # normalize
            # summed has shape [batch_size, max_src_len]

        _, topk_idxs = summed.topk(top_k, sorted=True)
        # topk_idxs has shape [batch_size, top_k]

        # split by each item in batch
        split_batch_toks = batch_toks.split(1, dim=0)
        split_topk_idxs = topk_idxs.split(1, dim=0)

        for idxs, toks in zip(split_topk_idxs, split_batch_toks):
            attended_toks = toks.squeeze(0)[idxs.squeeze(0)]
            if all_layers:
                # record topk attended tokens for each layer
                for i in range(summed.shape[1]):
                    attended_word_tensor[i, attended_toks[i]] += 1
            else:
                attended_word_tensor[attended_toks] += 1

        total_words_attended += topk_idxs.shape[0] * topk_idxs.shape[1]

    if args.src is not None:
        args.src.close()

    if all_layers:
        f = args.out
        f.write('Total tokens attended: {}\n '.format(total_words_attended))
        for i in range(attended_word_tensor.shape[0]):
            non_zero_idxs = attended_word_tensor[i].nonzero().view(-1)
            counts = attended_word_tensor[i, non_zero_idxs].tolist()
            toks = tokenizer.convert_ids_to_tokens(non_zero_idxs.tolist())
            attended_word_dict = DD(int)
            attended_word_dict.update(zip(toks, counts))
            top_attn_count = sorted(attended_word_dict.items(),key=lambda p: p[1],reverse=True)

            out_top_k = 300 if not out_top_k else out_top_k
            f.write('\nLAYER {}\n'.format(i))
            for p in top_attn_count[:out_top_k]:
                f.write(p[0] + ' ' + str(p[1]) + '\n')
        f.close()
        print('Finished, total duration = {:.4}'.format(time.time() - total_start_time))
    else:
        non_zero_idxs = attended_word_tensor.nonzero().view(-1)
        counts = attended_word_tensor[non_zero_idxs].tolist()
        toks = tokenizer.convert_ids_to_tokens(non_zero_idxs.tolist())
        attended_word_dict = DD(int)
        attended_word_dict.update(zip(toks, counts))
        # end if of batch implementation
        print('Finished, total duration = {:.4}'.format(time.time() - total_start_time))

        top_attn_count = sorted(attended_word_dict.items(),key=lambda p: p[1],reverse=True)

        f = args.out
        f.write('Total tokens attended: {}\n '.format(total_words_attended))
        if out_top_k is not None:
            top_attn_count = top_attn_count[:out_top_k]
        for p in top_attn_count:
            f.write(p[0] + ' ' + str(p[1]) + '\n')
        f.close()
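
In both branches the attended-word counts become a DD(int) via dict.update over zip(toks, counts); the int default only matters if later code increments a token that was never attended. A short sketch of that build-then-sort step with made-up tokens and counts:

from collections import defaultdict as DD

toks = ['the', 'said', 'police']          # hypothetical tokens
counts = [42.0, 17.0, 5.0]                # hypothetical attention counts

attended_word_dict = DD(int)
attended_word_dict.update(zip(toks, counts))
attended_word_dict['unseen'] += 1         # missing keys still default to 0

top = sorted(attended_word_dict.items(), key=lambda p: p[1], reverse=True)
print(top[:2])                            # [('the', 42.0), ('said', 17.0)]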
Example #19
    def run_auto(self):
        '''
        test direct data feature based transfer accuracy on the new building
        '''
        rf = RFC(n_estimators=100, criterion='entropy')
        rf.fit(self.train_fd, self.train_label)
        pred = rf.predict(self.test_fd)
        print('direct data feature-based transfer acc on tgt_bldg:',
              ACC(pred, self.test_label))
        #plot_confusion_matrix(self.test_label, pred)
        '''
        step1: train base models from bldg1
        '''
        self.get_base_learners()
        '''
        step2: TL with name feature on bldg2
        '''
        label = self.test_label
        class_ = np.unique(self.train_label)

        for b in self.bl:
            print(b.score(self.test_fd, label))

        n_class = 32
        c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
        c.fit(self.test_fn)
        dist = np.sort(c.transform(self.test_fn))
        ex_id = DD(list)  #example id for each C
        for i, j, k in zip(c.labels_, range(len(self.test_fn)), dist):
            ex_id[i].append(int(j))

        # get neighbors for each example
        nb_c = DD()  #nb from clustering results
        for exx in ex_id.values():
            exx = np.asarray(exx)
            for e in exx:
                nb_c[e] = exx[exx != e]

        nb_f = [DD(), DD(), DD()]  #nb from classification results
        for b, n in zip(self.bl, nb_f):
            preds = b.predict(self.test_fd)
            ex_ = DD(list)
            for i, j in zip(preds, range(len(self.test_fd))):
                ex_[i].append(int(j))
            for exx in ex_.values():
                exx = np.asarray(exx)
                for e in exx:
                    n[e] = exx[exx != e]

        # use base learners' predictions
        acc_ = []
        cov_ = []
        #for delta in np.linspace(0.1, 0.5, 5):
        for delta in np.linspace(self.agreement_threshold,
                                 self.agreement_threshold, 1):
            print('running TL with agreement threshold =', delta)

            labeled_id = []
            confidence = []
            output = DD()
            preds = np.array([999 for i in range(len(self.test_fd))])
            for i in range(len(self.test_fn)):
                #get the weight for each bl: by computing sim btw cluster and clf
                w = []
                v_c = set(nb_c[i])
                for n in nb_f:
                    v_f = set(n[i])
                    cns = len(v_c & v_f) / float(
                        len(v_c | v_f))  #original count based weight
                    #print (len(v_c & v_f) , len(v_c | v_f))
                    inter = v_c & v_f
                    union = v_c | v_f
                    d_i = 0
                    d_u = 0
                    for it in inter:
                        d_i += np.linalg.norm(self.test_fn[i] -
                                              self.test_fn[it])
                        #print (np.linalg.norm(self.test_fn[i]-self.test_fn[it]))
                    #input('...')
                    for u in union:
                        d_u += np.linalg.norm(self.test_fn[i] -
                                              self.test_fn[u])
                    if len(inter) != 0:
                        sim = 1 - (d_i / d_u) / cns
                        #sim = (d_i/d_u)/cns

                    if i in output:
                        output[i].extend(
                            ['%s/%s' % (len(inter), len(union)), 1 - sim])
                    else:
                        output[i] = [
                            '%s/%s' % (len(inter), len(union)), 1 - sim
                        ]
                    w.append(sim)
                output[i].append(np.mean(w))

                if np.mean(w) >= delta:
                    confidence.append(np.mean(w))
                    w[:] = [float(j) / sum(w) for j in w]
                    pred_pr = np.zeros(len(class_))
                    for wi, b in zip(w, self.bl):
                        pr = b.predict_proba(self.test_fd[i].reshape(1, -1))
                        pred_pr = pred_pr + wi * pr
                    preds[i] = class_[np.argmax(pred_pr)]
                    labeled_id.append(i)

            acc_.append(ACC(preds[preds != 999], label[preds != 999]))
            cov_.append(1.0 * len(preds[preds != 999]) / len(label))

        print('acc =', acc_, ';')
        print('cov =', cov_, ';')

        return preds[preds != 999], labeled_id, confidence
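
This example mixes both flavours: ex_id = DD(list) groups sample indices by cluster label, while nb_c = DD() and the entries of nb_f are created with no default_factory, so they behave like plain dicts and a missing key raises KeyError rather than being created. A short sketch of both behaviours with toy labels:

from collections import defaultdict as DD

labels = [0, 1, 0, 1, 1]                  # toy cluster labels
ex_id = DD(list)                          # cluster label -> list of example ids
for j, lab in enumerate(labels):
    ex_id[lab].append(j)
print(dict(ex_id))                        # {0: [0, 2], 1: [1, 3, 4]}

nb_c = DD()                               # no default_factory: acts like a plain dict
nb_c[0] = [2]
try:
    nb_c[99]
except KeyError:
    print('missing key raises KeyError')  # unlike DD(list), nothing is auto-created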
Example #20
def voteRefine(sequences, motifs):
    #get probabilities
    lets = "ACGT"
    probability = DD(int)
    for seq in sequences:
        for let in sequences[seq]:
            probability[let] += 1
    s = sum(probability.values())
    for let in lets:
        probability[let] = probability[let] / float(s)

    #conductPoll
    poll = {}
    maxV = 0
    maxL = 0
    for seq in sequences:
        poll[seq] = [0.0] * len(sequences[seq])
        if len(sequences[seq]) > maxL:
            maxL = len(sequences[seq])

    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    for i in xrange(pos, pos + len(motif)):
                        try:
                            # instead of weighting all results the same (1), we
                            # could bias based on tool or number of results or something like that
                            #poll[sequence][i - 1] += 1
                            if tool == "CMF":
                                poll[sequence][i - 1] += 1
                            if tool == "Weeder":
                                poll[sequence][i - 1] += 1
                            if tool == "MEME":
                                poll[sequence][i - 1] += 1
                            if tool == "DECOD":
                                poll[sequence][i - 1] += 1
                            if tool == "BioProspector":
                                poll[sequence][i - 1] += 1
                            if tool == "XXmotif":
                                poll[sequence][i - 1] += 1

                        except Exception as e:
                            print e
                            print 'It appears a tool has reported finding a motif',\
                                'outside the bounds of a sequence'
                            print 'such as finding a motif of length 10 at position',\
                                '195 in a sequence with length 200'
                            pdb.set_trace()

                        if poll[sequence][i - 1] > maxV:
                            maxV = poll[sequence][i - 1]
    #inspectPoll

    ress = []
    THRESH = 3.7
    maxInsts = 0

    MLEN = MOTIF_LEN
    for seq in poll:
        for i in xrange(len(poll[seq]) - MLEN):
            if sum(poll[seq][i:i + MLEN]) >= MLEN * THRESH:
                curr = sequences[seq][i:i + MLEN]
                bestPWM = None
                bestMatching = 0
                for PWM in ress:
                    matching = compMotifPWM(curr, PWM)
                    if matching > bestMatching and matching > MLEN / 2:
                        bestMatching = matching
                        bestPWM = PWM
                if bestPWM is None:
                    bestPWM = [[0, 0, 0, 0] for x in xrange(MLEN)]
                    ress.append(bestPWM)
                for c, col in zip(curr, bestPWM):
                    col[ALPH[c]] += 1
                insts = sum(bestPWM[0])
                if insts > maxInsts:
                    maxInsts = insts

    votedRess = DD(int)
    for PWM in ress:
        l = len(PWM)
        cons = PWMconsensus(PWM)
        for seq in sequences:
            for spos in xrange(0, len(sequences[seq]) - l):
                # 75% match threshold
                if compMotifPWM(sequences[seq][spos:spos + l], PWM) >= .75 * l:
                    for pos in xrange(spos, spos + l):
                        votedRess[cons] += poll[seq][pos]

    return sorted(votedRess.iteritems(), key=lambda a: a[::-1])
Example #21
 def __init__(self):
     self._errors = DD(list)
     self._workbook_fp = None
     self._days_ago = 0
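
Here DD(list) simply accumulates error messages under arbitrary keys without initialisation; what the keys are (row numbers, sheet names, ...) is not visible in the excerpt, so the ones below are hypothetical:

from collections import defaultdict as DD

_errors = DD(list)                        # key choice (e.g. row number) is hypothetical
_errors['row 12'].append('missing date')
_errors['row 12'].append('bad amount')
_errors['row 30'].append('unknown account')
for key, msgs in _errors.items():
    print(key, '->', msgs)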