Example #1
import re

from myutils import prep, drop  # progress-message helpers (see Example #7)

def get_pred_ref(input_file):
    import tokenizer  # local module, resolved via a sys.path entry added elsewhere

    prep('preparing predictions list... ')
    preds = dict()
    predicts = open(input_file, 'r')
    for c, line in enumerate(predicts):
        (fid, pred) = line.split('\t')
        fid = int(fid)
        pred = pred.split()
        pred = fil(pred)  # fil() normalizes/filters tokens; defined elsewhere in the script
        preds[fid] = pred
    predicts.close()
    drop()

    # matches a special character, or a lowercase letter/digit/underscore
    # followed by an uppercase letter (a camelCase boundary)
    re_0001_ = re.compile(r'([^a-zA-Z0-9 ])|([a-z0-9_][A-Z])')

    refs = dict()
    newpreds = dict()
    d = 0
    targets = open('%s/coms.test' % (dataprep), 'r')  # dataprep (data directory) is set elsewhere
    for line in targets:
        (fid, com) = line.split(',')
        fid = int(fid)
        com = com.split()
        com = fil(com)

        try:
            newpreds[fid] = preds[fid]
        except KeyError:
            # no prediction for this fid; skip it
            continue

        refs[fid] = [com]

    return newpreds, refs
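
# Illustration of what re_0001_ matches: a special character, or a lowercase
# letter/digit/underscore followed by an uppercase letter (a camelCase
# boundary). The splitter below is a simplified stand-in for demonstration,
# not the original script's replacement function.
_re_camel = re.compile(r'([^a-zA-Z0-9 ])|([a-z0-9_][A-Z])')

def _demo_split(m):
    tmp = m.group(0)
    if len(tmp) > 1:
        return '{} {}'.format(tmp[0], tmp[1])  # split the camelCase boundary
    return ' '                                 # drop the special character

print(_re_camel.sub(_demo_split, 'getValue(x)'))  # -> 'get Value x '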
Example #2
    # args comes from argparse (see the sketch below); modelfile, gpu and
    # dataprep are set earlier in the function; K is the Keras backend
    outfile = args.outfile
    zerodats = args.zerodats
    datfile = args.datfile
    testval = args.testval

    if outfile is None:
        outfile = modelfile.split('/')[-1]  # default to the model file's basename

    K.set_floatx(args.dtype)
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = args.tf_loglevel

    sys.path.append(dataprep)
    import tokenizer

    prep('loading tokenizers... ')
    tdatstok = pickle.load(open('%s/tdats.tok' % (dataprep), 'rb'), encoding='UTF-8')
    comstok = pickle.load(open('%s/coms.tok' % (dataprep), 'rb'), encoding='UTF-8')
    smltok = pickle.load(open('%s/smls.tok' % (dataprep), 'rb'), encoding='UTF-8')
    drop()

    prep('loading sequences... ')
    seqdata = pickle.load(open('%s/%s' % (dataprep, datfile), 'rb'))
    drop()

    zerodats = (zerodats == 'yes')  # the flag arrives as a string from the command line
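
# The attributes read off args above imply an argparse setup along these
# lines; the flag names and defaults are assumptions inferred from the code,
# not taken from the original script:
import argparse

parser = argparse.ArgumentParser(description='run a trained model over the test set')
parser.add_argument('modelfile', type=str)
parser.add_argument('--outfile', dest='outfile', type=str, default=None)
parser.add_argument('--zero-dats', dest='zerodats', type=str, default='no')
parser.add_argument('--datfile', dest='datfile', type=str, default='dataset.pkl')
parser.add_argument('--testval', dest='testval', type=str, default='test')
parser.add_argument('--gpu', dest='gpu', type=str, default='0')
parser.add_argument('--dtype', dest='dtype', type=str, default='float32')
parser.add_argument('--tf-loglevel', dest='tf_loglevel', type=str, default='3')
args = parser.parse_args()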
Example #3
    # args comes from argparse (cf. the sketch after Example #2)
    dataprep = args.dataprep
    gpu = args.gpu
    batch_size = args.batch_size
    epochs = args.epochs
    modeltype = args.modeltype
    multigpu = args.multigpu

    K.set_floatx(args.dtype)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = args.tf_loglevel

    sys.path.append(dataprep)
    import tokenizer

    init_tf(gpu)  # helper defined elsewhere in the script; configures TensorFlow for the chosen GPU

    prep('loading tokenizers... ')
    tdatstok = pickle.load(open('%s/tdats.tok' % (dataprep), 'rb'),
                           encoding='UTF-8')
    sdatstok = pickle.load(open('%s/sdats.tok' % (dataprep), 'rb'),
                           encoding='UTF-8')
    comstok = pickle.load(open('%s/coms.tok' % (dataprep), 'rb'),
                          encoding='UTF-8')
    smltok = pickle.load(open('%s/smls.tok' % (dataprep), 'rb'),
                         encoding='UTF-8')
    drop()

    prep('loading sequences... ')
    seqdata = pickle.load(open('%s/dataset.pkl' % (dataprep), 'rb'))
    drop()

    steps = int(len(seqdata['ctrain']) / batch_size) + 1  # batches per training epoch
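
    # note: int(x / b) + 1 rounds up, but adds one extra step when x divides
    # evenly by b; math.ceil gives the exact count (math import assumed):
    steps_exact = math.ceil(len(seqdata['ctrain']) / batch_size)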
Example #4
    if obfuscate:
        dataprep = '../data/obfuscation/output'

    if sbt:
        dataprep = '../data/sbt/output'

    if input_file is None:
        print('Please provide an input file to test with --input')
        sys.exit()

    sys.path.append(dataprep)
    import tokenizer

    prep('preparing predictions list... ')
    preds = dict()
    predicts = open(input_file, 'r')
    for c, line in enumerate(predicts):
        (fid, pred) = line.split('\t')
        fid = int(fid)
        pred = pred.split()
        pred = fil(pred)
        preds[fid] = pred
    predicts.close()
    drop()

    # special characters or camelCase boundaries (see the note in Example #1)
    re_0001_ = re.compile(r'([^a-zA-Z0-9 ])|([a-z0-9_][A-Z])')

    refs = list()
    newpreds = list()
Example #5
def multireplace(string, substitutions):  # def line assumed; the snippet begins mid-function
    substrings = sorted(substitutions, key=len, reverse=True)  # longer keys take precedence
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)
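
# quick illustration of the one-pass replacement above (the example strings
# are made up; assumes re is imported):
print(multireplace('a < b && b < c', {'<': 'lt', '&&': 'and'}))
# -> 'a lt b and b lt c'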

def qID():
    # next sequential question ID (qid is a module-level counter)
    global qid
    qid += 1
    return qid

def aID():
    # next sequential answer ID (aid is a module-level counter)
    global aid
    aid += 1
    return aid

datasetloc = 'srcmldat'

# load srcML parse data to help locate elements in the code
prep('loading srcmlunits... ')
srcmlunits = pickle.load(open(datasetloc + '/srcml-standard.pkl', 'rb'))
sml2 = pickle.load(open(datasetloc + '/srcml-final-allcoms.pkl', 'rb'))

# merge the two pickles (equivalent to srcmlunits.update(sml2))
for key, val in sml2.items():
    srcmlunits[key] = val

drop()

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parentstack = list()  # stack of tags currently open around the cursor
        self.qasynth = dict()
        self.qasynth2 = dict()
        self.dataseq = list()
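
    # The original handler bodies are not shown; this is a minimal sketch of
    # how the handlers might maintain parentstack and dataseq (an assumption,
    # not the original implementation):
    def handle_starttag(self, tag, attrs):
        self.parentstack.append(tag)  # record the enclosing tag for children

    def handle_endtag(self, tag):
        if self.parentstack and self.parentstack[-1] == tag:
            self.parentstack.pop()

    def handle_data(self, data):
        # keep the text along with a copy of its enclosing tag path
        self.dataseq.append((list(self.parentstack), data))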
Example #6
import pickle
import networkx as nx
import re
import statistics
import numpy as np

from myutils import prep, drop  # progress-message helpers (see Example #7)

def load_good_fid():
    # collect the function IDs present in the filtered comments file
    filename = './output/dataset.coms'
    good_fid = []
    for line in open(filename):
        tmp = [x.strip() for x in line.split(',')]
        fid = int(tmp[0])
        good_fid.append(fid)

    return good_fid

prep('loading srcmlunits... ')
srcmlunits = pickle.load(open('srcml-standard.pkl', 'rb'))
sml2 = pickle.load(open('fundatsparsed-srcml-final-allcoms.pkl', 'rb'))

for key, val in sml2.items():
    srcmlunits[key] = val

drop()

def re_0002(i):
    # re.sub callback: split camel case and remove special characters
    tmp = i.group(0)
    if len(tmp) > 1:
        if tmp.startswith(' '):
            return tmp
        else:
            # the source cuts off here; completing with the usual subtoken
            # split, consistent with the demo after Example #1 (an assumption)
            return '{} {}'.format(tmp[0], tmp[1])
    else:
        return ' '  # lone special character becomes a space
Example #7
import math
import traceback
import argparse
import signal
import atexit
import time
import pickle  # needed for the pickle.load below

import random
import tensorflow as tf
import numpy as np

import networkx as nx

from myutils import prep, drop

prep('loading sequences... ')
seqdata = pickle.load(open('/nfs/projects/attn-to-fc/data/standard_3dfiles_graphast/dataset.pkl', 'rb'))
drop()

def idx2tok(nodelist, path):
    # map a path of node indices back to their tokens
    return [nodelist[idx] for idx in path]
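
# quick usage illustration (the nodelist and path here are made up):
print(idx2tok(['method', 'name', 'block', 'return'], [0, 2, 3]))
# -> ['method', 'block', 'return']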

# one way