Example No. 1
def get_deep_rest(datas, project, key):
    train_data = []
    new_msg_dict = Dict(lower=True)
    new_code_dict = Dict(lower=True)

    for name, data in datas.items():
        if name == project:
            continue

        if key == 'dict':
            msg_dict, code_dict = data[key]
            for word in msg_dict.keys():
                new_msg_dict.add(word)
            for word in code_dict.keys():
                new_code_dict.add(word)
            continue

        if not train_data:
            for idx, elem in enumerate(data[key]):
                train_data.append([])
                train_data[-1].extend(elem)
        else:
            for idx, elem in enumerate(data[key]):
                train_data[idx].extend(elem)
    new_msg_dict = new_msg_dict.prune(100000)
    new_code_dict = new_code_dict.prune(100000)
    if key == 'dict':
        return [new_msg_dict.get_dict(), new_code_dict.get_dict()]
    return train_data
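A toy illustration of the column-wise merge the loop above performs across projects (hypothetical data; a sketch, not the real datas layout):

data_a = [[1, 2], ['x']]        # project A: two "columns"
data_b = [[3], ['y', 'z']]      # project B: same column layout
train = []
for d in (data_a, data_b):
    if not train:
        train = [list(col) for col in d]   # first project seen: copy the columns
    else:
        for i, col in enumerate(d):
            train[i].extend(col)           # later projects: extend column-wise
print(train)  # [[1, 2, 3], ['x', 'y', 'z']]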
Example No. 2
def makeVocabulary(filename, size):
    "Construct the word and feature vocabs."
    vocab = Dict([
        Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
        Constants.EOS_WORD
    ],
                 lower=opt.lower)
    featuresVocabs = []
    with codecs.open(filename, "r", "utf-8") as f:
        for sent in f.readlines():
            words, features, numFeatures \
                = IO.extractFeatures(sent.split())

            if len(featuresVocabs) == 0 and numFeatures > 0:
                for j in range(numFeatures):
                    featuresVocabs.append(
                        Dict([
                            Constants.PAD_WORD, Constants.UNK_WORD,
                            Constants.BOS_WORD, Constants.EOS_WORD
                        ]))
            else:
                assert len(featuresVocabs) == numFeatures, \
                    "all sentences must have the same number of features"

            for i in range(len(words)):
                vocab.add(words[i])
                for j in range(numFeatures):
                    featuresVocabs[j].add(features[j][i])

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab, featuresVocabs
Example No. 3
class Card(label):
    cardno = 1
    constant = Dict(g=9.8, dt=0.01)

    def __init__(self, dct):
        super(Card, self).__init__(**dct.__dict__)
        self.force = vector(0.0, 0.0, 0.0)
        self.velocity = dct.velocity
        self.card = dct.card
        self.text = dct.text
        self.msg = dct.text
        Card.cardno += 1
        self.dy, self.bounce = 0, 0
        self.fmt = "card%d: %%s" % self.card
        self.report()

    def report(self):
        self.text = self.fmt % (self.msg)

    def __call__(self, text=""):
        self.msg = text if text else self.msg
        if self.visible:
            dxyz = self.velocity * Card.constant.dt
            (self.pos, self.dy) = (self.pos + dxyz, dxyz.y)
            if self.y <= self.radius:
                if (abs(self.dy) > 1e-2):
                    self.velocity.y = abs(self.velocity.y) * self.keep
                    self.bounce += 1
                else:
                    self.velocity.y = 0.0
                    self.y = self.radius
            else:
                self.velocity.y = (self.velocity.y -
                                   Card.constant.g * Card.constant.dt)
            self.report()
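__call__ above is an explicit-Euler step with a damped floor bounce; the same update as a standalone sketch (assumed constants, no vpython dependency):

g, dt, keep = 9.8, 0.01, 0.9  # gravity, timestep, restitution (assumed values)
y, vy, radius = 5.0, 0.0, 1.0
for _ in range(2000):
    y += vy * dt                 # move by the current velocity
    if y <= radius:              # the card touches the floor
        if abs(vy * dt) > 1e-2:
            vy = abs(vy) * keep  # bounce: reflect upward and damp
        else:
            vy, y = 0.0, radius  # too slow to bounce: rest on the floor
    else:
        vy -= g * dt             # in the air: apply gravity
print(round(y, 2))  # settles at the floor height (1.0)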
Example No. 4
    def __init__(self, port=None):
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        # self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)  # so any socket can use this port
        if port is not None:
            self.sock.bind(('', port))

        # Stuff for receiving: when something is received, put it in recvMsgsQ
        # so it can be processed.
        self.recvMsgsLock = threading.Lock()
        self.recvMsgsCV = threading.Condition(self.recvMsgsLock)
        self.recvMsgsQ = Queue.Queue()  # queue of messages available to be returned through recv()

        # Packets that were sent and whose ACKs the server is still waiting for.
        self.pendingACKsPacketsLock = threading.Lock()
        self.pendingACKsPackets = []

        # recvACKs[addressPortTuple] = list of ACKs of messages already received and processed,
        # so when the server gets a message it knows whether it was received before and can safely ignore it.
        self.recvACKs = Dict()

        self.ACKManager = ACKManager()  # manages ACKs for messages sent

        self.__startUpWaitingForAcksThread()
        self.__startUpListeningThread()
Example No. 5
class Cut(list):

    default = Dict(bits=3, base=10, codepointBits=21, reverse=True)
    default(enum=1 << default.bits,
            need=((default.codepointBits // default.bits) +
                  int((default.codepointBits % default.bits) != 0)))
    default(mask=default.enum - 1, poss=default.need * default.bits)

    def __init__(self, **kw):
        self.the = Dict(**Cut.default)
        self.the.update(**kw)
        if self.the.bits != Cut.default['bits']:
            self.the(enum=1 << self.the.bits,
                     need=((self.the.codepointBits // self.the.bits) +
                           int((self.the.codepointBits % self.the.bits) != 0)))
            self.the(mask=self.the.enum - 1, poss=self.the.need * self.the.bits)
        # Initialize the list of lists
        self.append([-1] * self.the.enum)

    #def __call(self, codepoint, digit=-1):
    #cuts = self.cut(codepoint):
    #N = len(self)
    #if self[this][segment] == -1:
    #pass

    def cut(self, c):
        cuts = [(c >> (shft * self.the.bits)) & self.the.mask
                for shft in range(self.the.need)]
        if self.the.reverse:
            cuts.reverse()
        return cuts

    def insert(self, c):
        pass
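cut() slices a codepoint into fixed-width bit groups; the same arithmetic as a standalone sketch, assuming the default bits=3 and codepointBits=21:

BITS = 3
CODEPOINT_BITS = 21
NEED = CODEPOINT_BITS // BITS + int(CODEPOINT_BITS % BITS != 0)  # groups required
MASK = (1 << BITS) - 1

def cut(c, reverse=True):
    # Extract NEED groups of BITS bits, least-significant group first.
    digits = [(c >> (shift * BITS)) & MASK for shift in range(NEED)]
    if reverse:
        digits.reverse()  # most-significant group first
    return digits

print(cut(0xE9))  # U+00E9 -> [0, 0, 0, 0, 3, 5, 1]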
Example No. 6
    def read_db(self, file_path):
        print("read_db")
        print(file_path)

        #--- open file
        with open(file_path, "rb") as fd:

            #--- header
            bheader = fd.read(Header.SIZE)
            header = Header(bheader)
            print(header)

            #--- dict
            fd.seek(header.dict_start)
            bdict = fd.read(header.dict_size)
            fdict = Dict(bdict)
            # for r in fdict.records_map.values():
            # 	print(r)

            #--- data
            # for r in fdict.records_map.values():
            # 	if r.pid == 0:
            # 		# print(r.fid, r.pid)
            # 		start = r.daddr
            # 		size = r.dsize
            # 		fd.seek(start)
            # 		bfile_data = fd.read(size)
            # 		record = DataRecord(bfile_data)
            # 		print(record)

            Store.reprint(fd, fdict, 0, 0)
Example No. 7
    def __init__(self):

        self.header = Header()
        self.fdata = Data()
        self.fdict = Dict()

        self.__fd = None
Example No. 8
    def read_db(self, file_path):
        print("read_db")
        print(file_path)

        #--- open file
        with open(file_path, "rb") as fd:

            #--- header
            bheader = fd.read(Header.SIZE)
            header = Header(bheader)
            print(header)

            #--- dict
            bdict = fd.read(header.dict_size)
            fdict = Dict(bdict)
            # for r in fdict.records_map.values():
            # 	print(r)
            # print(fdict)

            #--- data
            bdata = fd.read(header.data_size)
            fdata = Data(bdata)

            print()
            print("root files: ")
            for r in fdict.records_map.values():
                if r.pid == 0:
                    ff = fdata.get_record(r.daddr, r.dsize)
                    print(ff)
Example No. 9
def makeVocabulary(opt, name, filename, size):
    "Construct the word and feature vocabs."
    print("opt.lower: ", opt.lower)
    print(Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD)

    vocab = Dict([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD], lower=opt.lower)
    with codecs.open(filename, "r", "utf-8", errors='ignore') as f:
        for sent in f.readlines():
            if name == 'code':
                lang = opt.data_name.split('-')[1]
                if lang == 'python':
                    words = python_tokenize(sent)
                elif lang == 'java':
                    words = java_tokenize(sent)
                else:
                    words = sent.split()  # unknown language: fall back to whitespace tokens
            else:
                words = sent.split()
            for i in range(len(words)):
                vocab.add(words[i])

    originalSize = vocab.size()

    if size != 0:
        vocab = vocab.prune(size)
        print('Created dictionary of size %d (pruned from %d)' % (vocab.size(), originalSize))
    else:
        print('Created dictionary of size %d' % (vocab.size()))

    return vocab
Example No. 10
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'M': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'E': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'S': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            }
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        with codecs.open(r'./resources/Chinese_num.txt', 'r') as inputs1:
            for line in inputs1:
                for w in line.strip().split():
                    self.nums.append(w)
        with codecs.open(r'./resources/names.txt', 'r') as inputs2:
            for line in inputs2:
                for w in line.strip().split():
                    self.names.append(w)
Example No. 11
    def make_db(self, scan_path, out_file):
        print("make_db")
        print(scan_path)
        print(out_file)

        header = Header()
        fdict = Dict()
        fdata = Data()

        self.__start_scan(scan_path, fdata)

        print(len(fdata.records))

        #--- make dict
        for record in fdata.records:
            fdict.append_record(record.fid, record.pid)

        print(len(fdict.records_map))

        #--- update header data
        header.dict_start = Header.SIZE
        header.dict_size = fdict.get_bdata_size()

        header.data_start = Header.SIZE + header.dict_size

        # fdata_bin = fdata.pack()
        # header.data_size = len(fdata_bin)

        #---
        fdata_bin = b""
        # fdata_start = header.data_start
        fdata_start = 0
        for record in fdata.records:
            rb = record.pack()

            #--- update dict addr
            fdict.set_addr(record.fid, fdata_start, len(rb))

            fdata_bin += rb
            fdata_start += len(rb)

        header.data_size = len(fdata_bin)

        # flat: 1496, gzip: 8182
        # gzip_fdata_bin = gzip.compress(fdata_bin,9)

        # print("-"*80)
        # print("flat: {:>4}, gzip: {:>4}".format(len(gzip_fdata_bin), len(fdata_bin)))
        # print("-"*80)

        #--- make binary
        bdata = b""
        bdata += header.pack()
        bdata += fdict.pack()
        bdata += fdata_bin

        #--- write file
        with open(out_file, "wb") as fd:
            fd.write(bdata)
Example No. 13
def main():

    dicts = {}
    dicts['src'] = Dict()
    dicts['src_features'] = []  # keep the key defined even when src_type is not "text"
    if opt.src_type == "text":
        dicts['src'], dicts['src_features'] = \
                initVocabulary('source', opt.train_src, opt.src_vocab,
                               opt.src_vocab_size)

    dicts['tgt'], dicts['tgt_features'] = \
        initVocabulary('target',
                       opt.train_tgt,
                       opt.tgt_vocab,
                       opt.tgt_vocab_size)

    print('Preparing training ...')
    train = {}
    train['src'], train['tgt'], \
        train['src_features'], train['tgt_features'], \
        train['alignments'] \
        = makeData(opt.train_src, opt.train_tgt,
                   dicts['src'], dicts['tgt'],
                   dicts['src_features'], dicts['tgt_features'])
    print('Preparing validation ...')
    valid = {}
    valid['src'], valid['tgt'], \
        valid['src_features'], valid['tgt_features'], \
        valid['alignments'] \
        = makeData(opt.valid_src, opt.valid_tgt,
                   dicts['src'], dicts['tgt'],
                   dicts['src_features'], dicts['tgt_features'])

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')
    if opt.tgt_vocab is None:
        saveVocabulary('target', dicts['tgt'], opt.save_data + '.tgt.dict')
    if opt.features_vocabs_prefix:
        saveFeaturesVocabularies('source', dicts['src_features'],
                                 opt.save_data)
        saveFeaturesVocabularies('target', dicts['tgt_features'],
                                 opt.save_data)

    print('Saving data to \'' + opt.save_data + '.train.pt\'...')
    save_data = {
        'dicts': dicts,
        'type': opt.src_type,
        'train': train,
        'valid': valid
    }
    # torch.save(save_data, opt.save_data + '.train.pt')
    with open(opt.save_data + '.train.pt', 'wb') as fwrite:
        pickle.dump(save_data, fwrite)
Example No. 14
 def load(path):
     items = []
     #path = utils.format_filename(path)
     name = os.path.basename(path)
     if os.path.isdir(path):
         for i in os.listdir(path):
             p = os.path.join(path, i)
             if os.path.isdir(p):
                 items.append(Container.load(p))
             else:
                 items.append(Dict.load(p))
         return Container(name, items)
     else:
         return None
Example No. 15
 def __init__(self, **kw):
     argv = Dict(**kw)
     for arg in argv.arg:
         with open(arg) as source:
             soup = BeautifulSoup(source)
             print(soup.prettify())
     """
     name = []
     self.section = []
     self.info = {}
     for arg in argv.arg:
         with open(arg) as source:
             self.lines = []
             for line in source.readlines():
                 line = line.strip()
                 if line and not line.startswith('#'):
                     self.lines += [line, ]
             for line in self.lines:
                 num = 0
                 while line[0] == '=' and line[-1] == '=':
                     num += 1
                     line = line[1:-1]
                 line = line.strip()
                 if num:
                     index = num - 1
                     N = len(self.section)
                     if N < num:
                         self.section += ['', ]
                         N += 1
                     assert N >= num, 'section jump > 1 forbidden'
                     self.section = self.section[:num]
                     self.section[index] = line
                 info = self.info
                 for section in self.section:
                     info[section] = self.info.get(section, {})
                     info = info[section]
                 if not num:
                     if 'text' in info:
                         info['text'] += '\n' + line
                     else:
                         info['text'] = line
                     print(info['text'])
                     print()
                     #info['text'] = info.get('text', '') + '\n' + line
                 #print 'info: ', self.info
             pprint(self.info)
             print(arg)
     """
     print(self)
Example No. 16
def makeVocabulary(sentences, size=1000000, min_freq=None):
    vocab = Dict([
        Constants.PAD_WORD, Constants.UNK_WORD, Constants.EOS_WORD,
        Constants.BOS_WORD
    ])
    for sentence in sentences:
        for word in sentence.split():
            vocab.add(word)
    originalSize = vocab.size()
    if min_freq is not None:
        vocab = vocab.prune_by_freq(min_freq)
    else:
        vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))
    return vocab
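prune(size) above presumably keeps the size most frequent words while the special tokens stay in front; a self-contained sketch of that idea with collections.Counter (hypothetical; the real Dict may count and order differently):

from collections import Counter

SPECIALS = ['<blank>', '<unk>', '<s>', '</s>']  # stand-ins for the Constants.* tokens

def make_pruned_vocab(sentences, size):
    counts = Counter(w for s in sentences for w in s.split())
    return SPECIALS + [w for w, _ in counts.most_common(size)]

print(make_pruned_vocab(['a b a', 'b c'], size=2))  # specials plus 'a' and 'b'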
Example No. 17
def initVocabulary(name, dataFile, vocabFile, vocabSize):
    """If `vocabFile` exists, read it in,
    Else, generate from data."""
    vocab = None
    featuresVocabs = []  # stays empty when an existing vocabulary file is loaded
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        genWordVocab, genFeaturesVocabs = makeVocabulary(dataFile, vocabSize)
        vocab = genWordVocab
        featuresVocabs = genFeaturesVocabs

    print()
    return vocab, featuresVocabs
Example No. 18
    def main():
        "Entrypoint for local testing and use"
        arg = Dict(**docopt(__doc__, version=__version__))
        if arg.verbose:
            pprint(arg)

        if arg.test:
            test()
        else:
            unidigit = Digit(ingest=True, unique=False)
            if arg.javascript:
                unidigit.emit()

            for string in arg.INTEGER:
                value = 0
                for codepoint in string:  # already text under Python 3
                    ordinal = ord(codepoint)
                    print('%s %06x' % (codepoint, ordinal), end=' ')
                    value = value * 10 + unidigit(ordinal)
                print(value)
Example No. 19
    def prepareData(self):
        #read train data
        print("Reading Training Data...")
        pairs = self.readInput(self.train_file)
        print("Read %s sentence pairs" % len(pairs))

        if len(self.dev_file) == 0:
            # No dev file given: hold out 10% of the pairs as the development set.
            num_dev = int(len(pairs) * 0.1)
            random.shuffle(pairs)
            dev_pairs = pairs[:num_dev]
            train_pairs = pairs[num_dev:]
        else:
            train_pairs = pairs
            print("Reading Development Data...")
            dev_pairs = self.readInput(self.dev_file)
            print("Read %s sentence pairs" % len(dev_pair1s))

        print("Reading Testing Data...")
        test_pairs = self.readInput(self.test_file)
        print("Read %s sentence pairs" % len(test_pairs))

        self.countClassInvFreq(train_pairs)
        train_pairs = self.trimSents(train_pairs)
        self.en_dict = Dict()
        for pair in train_pairs:
            self.en_dict.addSentence(pair[0])

        print(
            "Number of words before removing word frequency below threshold: %d"
            % self.en_dict.n_words)
        self.en_dict.removeLowFreqWords(self.freq_threshold)
        print("Number of words after threshold: %d" % self.en_dict.n_words)

        train = Data(self.stringToIndex(train_pairs), self.max_batch_size)
        dev = Data(self.stringToIndex(dev_pairs), self.max_batch_size)
        test = Data(self.stringToIndex(test_pairs), self.max_batch_size)

        return self.en_dict, train, dev, test
Example No. 20
def getCards(f, configname='card3d.cfg'):
    config = ConfigParser()
    config.read(configname)
    config.card3dname = configname
    cards = {}
    dct = Dict(velocity=vector(0, -1, 0),
               color=(1, 1, 1),
               background=(0.25, 0.25, 0.25),
               opacity=0.2,
               keep=1.0,
               frame=f,
               visible=True)
    for num, (key, val) in enumerate(config._sections.items()):
        if not key.startswith('card'):
            continue
        dig = key[4:]
        if not dig.isdigit():
            continue
        dig = int(dig)
        line = val['text']
        (dct.pos, dct.card, dct.text) = ((num % 4, 4, num % 4), dig, line)
        dct.radius = 1
        cards[key] = Card(dct)
    return (config, cards)
Example No. 21
 def test_key(self):
     d = Dict()
     d["a"] = 1
     self.assertTrue(d.a == 1)
Example No. 22
 def test_attrerror(self):
     d = Dict()
     with self.assertRaises(AttributeError):
         value = d.empty
Example No. 23
 def test_keyerror(self):
     d = Dict()
     with self.assertRaises(KeyError):
         value = d['empty']
Example No. 24
 def test_init(self):
     d = Dict(a=1, b='test')
     self.assertEqual(d.a, 1)
     self.assertEqual(d.b, 'test')
     self.assertTrue(isinstance(d, dict))
Example No. 25
 def test_attr(self):
     d = Dict()
     d.key = "value"
     self.assertTrue('key' in d)
     self.assertTrue(d['key'] == "value")
Example No. 26
 def test_attrError(self):
     d = Dict()
     with self.assertRaises(AttributeError):
         key = d.empty
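The tests in Examples 21-26 (and 28-30 below) exercise attribute-style access on Dict; a minimal sketch that would satisfy them (hypothetical; the real Dict class likely does more):

class Dict(dict):
    """A dict whose keys can also be read and written as attributes."""
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            # Attribute lookup must raise AttributeError, not KeyError.
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self[key] = value

d = Dict(a=1)
d.b = 2
print(d.a, d['b'])  # 1 2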
Example No. 27
def get_model_data(args):
    pjdb = client[args.project]
    ppcommits_db = pjdb[args.preprocess_commit]

    create_dir(args.save_path)

    ids, labels, msgs, codes, deepjit_codes, deepjit_raw_codes = [], [], [], [], [], []

    after = datetime.datetime.strptime(args.after, "%Y-%m-%d").timestamp()
    before = datetime.datetime.strptime(args.before, "%Y-%m-%d").timestamp()
    ppcommits_db.create_index('commit_date')

    msg_dict = Dict(lower=True)
    code_dict = Dict(lower=True)

    for commit in tqdm(ppcommits_db.find({"commit_date": {'$gte': after, '$lte': before}, 
                                          "median_issue": 0}).sort([('commit_date',1)])):

        label = 0 if commit['bug_count'] == 0 else 1

        commit_id = commit['commit_id']

        msg = commit['commit_msg'].strip()
        msg = split_sentence(msg)
        msg = ' '.join(msg.split(' ')).lower()

        for word in msg.split():
            msg_dict.add(word)

        format_code = []
        files_code = []
        raw_code = []
        for diff_file in commit['files_diff']:
            diff = commit['files_diff'][diff_file]
            added_code, removed_code, file_codes = [], [], []

            for line in diff['a']:
                # if len(diff['a'][line]['code'].split()) > 3:
                remove_code = diff['a'][line]['code'].strip()
                remove_code = ' '.join(split_sentence(remove_code).split())
                remove_code = ' '.join(remove_code.split(' '))
                removed_code.append(remove_code)
                for word in remove_code.split():
                    code_dict.add(word)
                # remove_code = 'removed _ code'
                file_codes.append((line, remove_code))
                if len(removed_code) > 10: break

            for line in diff['b']:
                # if len(diff['b'][line]['code'].split()) > 3:
                add_code = diff['b'][line]['code'].strip()
                add_code = ' '.join(split_sentence(add_code).split())
                add_code = ' '.join(add_code.split(' '))
                added_code.append(add_code)
                for word in add_code.split():
                    code_dict.add(word)
                # add_code = 'added _ code'
                file_codes.append((line, add_code))
                if len(added_code) > 10: break

            file_codes.sort(key=lambda x: x[0])
            raw_code.extend([code[1] for code in file_codes])
            raw_code = raw_code[:10]
            format_code.append("added _ code removed _ code")
            files_code.append({'added_code': added_code, 'removed_code': removed_code})
            # shuffle(code)

            if len(format_code) == 10: break

        # if len(format_code) == 0:
        #     continue

        ids.append(commit_id)
        labels.append(label)
        msgs.append(msg)
        deepjit_codes.append(format_code)
        deepjit_raw_codes.append(raw_code)
        codes.append(files_code)
        
    train_ids, test_ids = split_data(args, ids)
    train_labels, test_labels = split_data(args, labels)
    train_msgs, test_msgs = split_data(args, msgs)
    deepjit_train_codes, deepjit_test_codes = split_data(args, deepjit_codes)
    deepjit_train_raw_codes, deepjit_test_raw_codes = split_data(args, deepjit_raw_codes)
    train_codes, test_codes = split_data(args, codes)


    deepjit_train_data = [train_ids, train_labels,
                          train_msgs, deepjit_train_codes]
    deepjit_train_raw_data = [train_ids, train_labels,
                          train_msgs, deepjit_train_raw_codes]
    deepjit_test_data = [test_ids, test_labels, test_msgs, deepjit_test_codes]
    deepjit_test_raw_data = [test_ids, test_labels, test_msgs, deepjit_test_raw_codes]

    deepjit_all_data = [ids, labels, msgs, deepjit_codes]
    deepjit_all_raw_data = [ids, labels, msgs, deepjit_raw_codes]


    cc2vec_train_data = [train_ids, train_labels, train_msgs, train_codes]
    cc2vec_test_data = [test_ids, test_labels, test_msgs, test_codes]

    cc2vec_all_data = [ids, labels, msgs, codes]

    dextend_train_data = [train_ids, train_labels,
                          train_msgs, deepjit_train_codes]
    dextend_test_data = [test_ids, test_labels, test_msgs, deepjit_test_codes]

    dextend_all_data = [ids, labels, msgs, deepjit_codes]

    raw_dextend_train_data = [train_ids, train_labels,
                          train_msgs, deepjit_train_raw_codes]
    raw_dextend_test_data = [test_ids, test_labels, test_msgs, deepjit_test_raw_codes]

    raw_dextend_all_data = [ids, labels, msgs, deepjit_raw_codes]
    
    with open('{}/deepjit/{}_train.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(deepjit_train_data, f)
    with open('{}/deepjit/{}_test.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(deepjit_test_data, f)
    with open('{}/deepjit/{}_all.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(deepjit_all_data, f)

    with open('{}/deepjit/{}_train_raw.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(deepjit_train_raw_data, f)
    with open('{}/deepjit/{}_test_raw.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(deepjit_test_raw_data, f)
    with open('{}/deepjit/{}_all_raw.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(deepjit_all_raw_data, f)

    with open('{}/cc2vec/{}_train.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(cc2vec_train_data, f)
    with open('{}/cc2vec/{}_test.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(cc2vec_test_data, f)
    with open('{}/cc2vec/{}_all.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(cc2vec_all_data, f)

    with open('{}/cc2vec/{}_train_dextend.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(dextend_train_data, f)
    with open('{}/cc2vec/{}_test_dextend.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(dextend_test_data, f)
    with open('{}/cc2vec/{}_all_dextend.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(dextend_all_data, f)

    with open('{}/cc2vec/{}_train_dextend_raw.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(raw_dextend_train_data, f)
    with open('{}/cc2vec/{}_test_dextend_raw.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(raw_dextend_test_data, f)
    with open('{}/cc2vec/{}_all_dextend_raw.pkl'.format(args.save_path, args.project), 'wb') as f:
        pickle.dump(raw_dextend_all_data, f)

    msg_dict = msg_dict.prune(100000)
    code_dict = code_dict.prune(100000)

    project_dict = [msg_dict.get_dict(), code_dict.get_dict()]

    pickle.dump(project_dict, open("{}/{}_dict.pkl".format(args.save_path, args.project), 'wb'))
    pickle.dump(project_dict, open("{}/deepjit/{}_dict.pkl".format(args.save_path, args.project), 'wb'))
    pickle.dump(project_dict, open("{}/cc2vec/{}_dict.pkl".format(args.save_path, args.project), 'wb'))
    pickle.dump(project_dict, open("{}/feature/{}_dict.pkl".format(args.save_path, args.project), 'wb'))

    print('Train data size: {}, Bug size: {}'.format(
        len(train_labels), sum(train_labels)))
    print('Test data size: {}, Bug size: {}'.format(
        len(test_labels), sum(test_labels)))
Example No. 28
 def test_key(self):
     d = Dict()
     d['key'] = 'value'
     self.assertEqual(d.key, 'value')
Example No. 29
 def test_init(self):
     d = Dict(a=1, b="test")
     self.assertEqual(d.a, 1)
     self.assertEqual(d.b, "test")
     self.assertTrue(isinstance(d, dict))
Example No. 30
 def test_attr(self):
     d = Dict()
     d.key = 'value'
     self.assertTrue('key' in d)
     self.assertEqual(d['key'], 'value')