def main(argc, argv):
    if argc < 6:
        print(
            'Usage:%s <data> <char_vob> <train_output> <test_output> <output_type>'
            % (argv[0]))
        sys.exit(1)

    char_vob = w2v.Word2vecVocab()
    char_vob.Load(argv[2])

    word_vob = w2v.Word2vecVocab()
    # Note: the word vocabulary path is hard-coded here.
    word_vob.Load('words_vec_100.txt')

    train_out = open(argv[3], 'w')
    test_out = open(argv[4], 'w')

    with open(argv[1], 'r') as f:
        csv_reader = csv.reader(f, delimiter=',')
        data = [row for row in csv_reader]
        stat_max_len(data)
        train_data, test_data = build_dataset(data)
        processLine(train_out, argv[5], train_data, char_vob, word_vob)
        processLine(test_out, argv[5], test_data, char_vob, word_vob)

    train_out.close()
    test_out.close()
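These excerpts omit their module headers and entry points. A plausible preamble for each, assuming the names used in the call sites (w2v is a local word2vec wrapper exposing Word2vecVocab), looks like:

# Assumed preamble (not part of the excerpts): standard modules used above
# plus the local w2v wrapper; other examples additionally need os, json, etc.
import sys
import csv
import w2v  # local module providing Word2vecVocab (inferred, not shown)

if __name__ == '__main__':
    # Every example's main takes C-style (argc, argv).
    main(len(sys.argv), sys.argv)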
Example #2
def main(argc, argv):
    global totalLine
    global longLine
    global totalChars
    if argc < 6:
        print("Usage:%s <word_vob> <char_vob> <pos_vob>  <dir> <output>" %
              (argv[0]))
        sys.exit(1)
    wvobPath = argv[1]
    cvobpath = argv[2]
    pvobPath = argv[3]
    rootDir = argv[4]
    word_vob = w2v.Word2vecVocab()
    word_vob.Load(wvobPath)
    char_vob = w2v.Word2vecVocab()
    char_vob.Load(cvobpath)
    posVob = {}
    loadPosVob(pvobPath, posVob)
    out = open(argv[5], "w")
    for dirName, subdirList, fileList in os.walk(rootDir):
        for file in fileList:
            if file.endswith(".txt"):
                # os.walk already yields the full directory path in dirName;
                # joining rootDir onto it again would duplicate the prefix.
                curFile = os.path.join(dirName, file)
                #print("processing:%s" % (curFile))
                with open(curFile, "r") as fp:
                    for line in fp:
                        line = line.strip()
                        processLine(line, out, word_vob, char_vob, posVob)
    out.close()
    print("total:%d, long lines:%d, chars:%d" %
          (totalLine, longLine, totalChars))
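loadPosVob itself is outside the excerpt. Judging from the call site (a dict filled in place), a minimal sketch, assuming a one-tag-per-line vocabulary file, might be:

# Hypothetical sketch of loadPosVob: fill posVob with a tag -> integer-id
# mapping, one tag per line; the real file format may differ.
def loadPosVob(path, posVob):
    with open(path, "r") as fp:
        for idx, line in enumerate(fp):
            tag = line.strip()
            if tag:
                posVob[tag] = idx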
Example #3
def main(argc, argv):
	global totalLine
	global longLine
	global totalChars
	if argc < 5:
		print("Usage:%s <vec_vob> <tag_vob> <corpus> <output>" %
					(argv[0]))
		sys.exit(1)
	cvobpath = argv[1]
	pvobPath = argv[2]
	corpusPath = argv[3]
	vec_vob = w2v.Word2vecVocab()
	vec_vob.Load(cvobpath)
	tagVob = {}
	loadtagVob(pvobPath, tagVob)
	out = open(argv[4], "w")
	with open(corpusPath, 'r') as fp:
		all_text = fp.readlines()
		file_len = len(all_text)
		for count, line in enumerate(all_text):
			line = line.strip()

			if count % 1000 == 0:
				# periodic progress: lines processed so far / total lines
				print(count, file_len)
			processLine(line, out, vec_vob, tagVob)

	out.close()
	print("total:%d, long lines:%d, chars:%d" %
				(totalLine, longLine, totalChars))

	split_train_testing(argv[4])
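split_train_testing is likewise not shown. A minimal sketch consistent with its single-path argument, splitting the generated file into train and test files by a fixed ratio, could be:

import random

# Hypothetical sketch: shunt each line of the generated file into a
# .train or .test sibling file; the real split logic may differ.
def split_train_testing(path, test_ratio=0.1):
    with open(path, 'r') as fp, \
            open(path + '.train', 'w') as train, \
            open(path + '.test', 'w') as test:
        for line in fp:
            (test if random.random() < test_ratio else train).write(line)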
Example #4
def main(argc, argv):
  if argc < 3:
    print("Usage:%s <word2vec_vocab_path> <output_path>" % (argv[0]))
    sys.exit(1)
  vob = w2v.Word2vecVocab()
  vob.Load(argv[1])
  vob.DumpBasicVocab(argv[2])
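A sample invocation (the script and file names here are hypothetical):

    python dump_basic_vocab.py words_vec_100.txt basic_vocab.txt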
Example #5
def main(argc, argv):
    global totalLine
    global longLine
    global totalChars
    if argc < 4:
        print("Usage:%s <vob> <dir> <output>" % (argv[0]))
        sys.exit(1)
    vobPath = argv[1]
    rootDir = argv[2]
    vob = w2v.Word2vecVocab()
    vob.Load(vobPath)
    out = open(argv[3], "w")
    for dirName, subdirList, fileList in os.walk(rootDir):
        for file in fileList:
            if file.endswith(".txt"):
                curFile = os.path.join(dirName, file)
                #print("processing:%s" % (curFile))
                with open(curFile, "r") as fp:
                    for line in fp:
                        line = line.strip()
                        processLine(line, out, vob)
    out.close()
    print("total:%d, long lines:%d, chars:%d" %
          (totalLine, longLine, totalChars))
Example #6
def main(argc, argv):
    if argc < 4:
        print("Usage: %s <input>  <output> <vec>" % (argv[0]))
        sys.exit(1)
    vob = w2v.Word2vecVocab()
    vob.Load(argv[3])
    # Use context managers so both files are closed (the original leaked them).
    with open(argv[1], "r") as inp, open(argv[2], "w") as oup:
        processFile(inp, oup, vob)
Example #7
def main(argc, argv):
    if argc < 5:
        print('Usage:%s <data_dir> <char_vob> <output_type> <output_name>' %
              (argv[0]))
        sys.exit(1)

    char_vob = w2v.Word2vecVocab()
    char_vob.Load(argv[2])

    process_data(argv[1], char_vob, int(argv[3]), argv[4])
Example #8
def main(argc, argv):
	if argc != 6:
		print('Usage:{} <vec_vob> <char_vob> <tag_vob> <corpus> <output>'.format(argv[0]))
		sys.exit(1)
	wvobPath = argv[1]
	cvobPath = argv[2]
	tagvobPath = argv[3]
	corpusPath = argv[4]
	outputPath = argv[5]
	# load the word2vec vocabularies for words and characters
	word_vob = w2v.Word2vecVocab()
	char_vob = w2v.Word2vecVocab()
	word_vob.Load(wvobPath)
	char_vob.Load(cvobPath)
	tag_vob = loadtagVob(tagvobPath)

	generate_train = Generate_train(char_vob, word_vob, tag_vob)
	processed_lines_gen = generate_train.process(corpusPath)
	print('total line:{}'.format(generate_train.total_line))
	write_train_data(outputPath, processed_lines_gen, generate_train.total_line)
Example #9
def main(argc, argv):
    if argc < 6:
        print(
            "Usage:%s <data> <word_vob> <char_vob> <train_output> <test_output>"
            % (argv[0]))
        # Exit on bad usage, as the other examples do; without this the
        # script would crash on the missing arguments below.
        sys.exit(1)

    train_output = open(argv[4], "w")
    test_output = open(argv[5], "w")

    word_vob = w2v.Word2vecVocab()
    word_vob.Load(argv[2])
    char_vob = w2v.Word2vecVocab()
    char_vob.Load(argv[3])
    with open(argv[1], 'r') as f:
        data = json.load(f)
        stat_max_len(data)
        train_data, test_data = build_dataset(data)
        generate_net_input(train_data, train_output, word_vob, char_vob)
        generate_net_input(test_data, test_output, word_vob, char_vob)

    train_output.close()
    test_output.close()
Example #10
def main(argc, argv):
    if argc < 3:
        print(
            "Usage: %s <input> <output> [mode | 0 for w2v, 1 for training] [vec_path | required if mode is not 0]"
            % (argv[0]))
        sys.exit(1)
    mode = 0
    vob = None
    if argc > 4:
        mode = int(argv[3])
        vob = w2v.Word2vecVocab()
        vob.Load(argv[4])
    with open(argv[1], "r") as inp, open(argv[2], "w") as oup:
        processFile(inp, oup, mode, vob)
Example #11
def doGen(inputPath, outputPath, vocabPath):
    global totalLine
    global longLine
    global totalChars
    vob = w2v.Word2vecVocab()
    vob.Load(vocabPath)
    with open(inputPath, "r") as inp, open(outputPath, "w") as out:
        for line in inp:
            line = line.strip()
            if not line:
                continue
            processLine(line, vob, out)
    print("total:%d, long lines:%d, chars:%d" %
          (totalLine, longLine, totalChars))
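A sample call for doGen, with placeholder paths:

# All three paths below are hypothetical placeholders.
doGen("corpus.txt", "train_features.txt", "chars_vec_100.txt")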
Example #12
def convert(trainPath,
            trainOutPath,
            testOutPath,
            vocabPath,
            titleVobPath,
            partFrom=0,
            partEnd=9,
            testRatio=0.02):
    vocab = w2v.Word2vecVocab()
    vocab.Load(vocabPath)
    writerTrain = tf.python_io.TFRecordWriter(trainOutPath)
    writerTest = tf.python_io.TFRecordWriter(testOutPath)
    # Note: the segmenter configuration path is hard-coded.
    raseg.init_config("/var/local/seg/conf/qsegconf.ini")
    seg = raseg.ImTokenizer()
    titleVocab = {}
    load_title_dict(titleVobPath, titleVocab)
    numTag = len(titleVocab) + 1
    nTrain = 0
    nTest = 0
    processed = 0

    for i in range(partFrom, partEnd + 1):
        with open("%s/part-r-%05d" % (trainPath, i), "r") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                processed += 1
                ss = line.split("\t")
                assert (len(ss) == 8)
                title = ss[0].lower()
                if title == '网络':
                    title = '网络工程师'
                if title not in titleVocab:
                    print("[%s] not there!! " % (title))
                    continue

                target = titleVocab[title]
                target_orgId = int(ss[1])
                gender = int(ss[2])
                age = int(ss[3])
                location = int(ss[4])
                edustrs = ss[5].split(" ")
                assert (len(edustrs) == 12)
                edu_expr1 = EduExperience(edustrs, 0)
                edu_expr2 = EduExperience(edustrs, 1)
                edu_expr3 = EduExperience(edustrs, 2)

                workstrs = ss[6].split(" ")
                assert (len(workstrs) == 18)
                work_expr1 = WorkExperience(workstrs, 0)
                workTokens = gen_sentence_features(work_expr1.desc, vocab, seg)
                work_expr2 = WorkExperience(workstrs, 1)
                workTokens += gen_sentence_features(work_expr2.desc, vocab,
                                                    seg)
                work_expr3 = WorkExperience(workstrs, 2)
                workTokens += gen_sentence_features(work_expr3.desc, vocab,
                                                    seg)

                projstrs = ss[7].split(" ")
                assert (len(projstrs) == 3)
                proj1 = ""
                if projstrs[0] != 'None':
                    proj1 = base64.b64decode(projstrs[0])
                proj2 = ""
                if projstrs[1] != 'None':
                    proj2 = base64.b64decode(projstrs[1])
                proj3 = ""
                if projstrs[2] != 'None':
                    proj3 = base64.b64decode(projstrs[2])

                projTokens = gen_sentence_features(proj1, vocab, seg)
                projTokens += gen_sentence_features(proj2, vocab, seg)
                projTokens += gen_sentence_features(proj3, vocab, seg)

                assert (len(workTokens) == (3 * MAX_TOKEN_NUM_PER_SENTENCE))
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        "target":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[target])),
                        "target_orgId":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[target_orgId])),
                        "gender":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[gender])),
                        "age":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[age])),
                        "location":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[location])),
                        "education_schools":
                        tf.train.Feature(int64_list=tf.train.Int64List(value=[
                            edu_expr1.school, edu_expr2.school,
                            edu_expr3.school
                        ])),
                        "education_degrees":
                        tf.train.Feature(int64_list=tf.train.Int64List(value=[
                            edu_expr1.degree, edu_expr2.degree,
                            edu_expr3.degree
                        ])),
                        "education_starts":
                        tf.train.Feature(float_list=tf.train.FloatList(value=[
                            edu_expr1.start, edu_expr2.start, edu_expr3.start
                        ])),
                        "education_majors":
                        tf.train.Feature(int64_list=tf.train.Int64List(value=[
                            edu_expr1.major, edu_expr2.major, edu_expr3.major
                        ])),
                        "work_expr_orgs":
                        tf.train.Feature(int64_list=tf.train.Int64List(value=[
                            work_expr1.org, work_expr2.org, work_expr3.org
                        ])),
                        "work_expr_starts":
                        tf.train.Feature(float_list=tf.train.FloatList(value=[
                            work_expr1.start, work_expr2.start,
                            work_expr3.start
                        ])),
                        "work_expr_durations":
                        tf.train.Feature(float_list=tf.train.FloatList(value=[
                            # spelling follows the (unshown) WorkExperience
                            # class's attribute name
                            work_expr1.duaration, work_expr2.duaration,
                            work_expr3.duaration
                        ])),
                        "work_expr_jobs":
                        tf.train.Feature(int64_list=tf.train.Int64List(value=[
                            work_expr1.job, work_expr2.job, work_expr3.job
                        ])),
                        "work_expr_orgIds":
                        tf.train.Feature(int64_list=tf.train.Int64List(value=[
                            work_expr1.orgId, work_expr2.orgId,
                            work_expr3.orgId
                        ])),
                        "work_expr_descs":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=workTokens)),
                        "proj_expr_descs":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=projTokens)),
                    }))
                if random.random() <= testRatio:
                    writerTest.write(example.SerializeToString())
                    nTest += 1
                else:
                    writerTrain.write(example.SerializeToString())
                    nTrain += 1
                if processed % 200 == 0:
                    # The original counters were misleadingly named npos/nneg;
                    # they count train and test writes, respectively.
                    print("processed %d, test:%d, train:%d....." %
                          (processed, nTest, nTrain))
    print("max len of sentences:%d" % (maxTokens))
    writerTrain.close()
    writerTest.close()
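A sample call for convert; the paths are hypothetical, while the part-r-%05d naming and the 2% default test split come from the function itself:

# Reads part-r-00000 .. part-r-00009 under the given directory and writes
# train/test TFRecord files; all paths here are placeholders.
convert("/data/resume_parts", "train.tfrecord", "test.tfrecord",
        "words_vec_100.txt", "title_vocab.txt")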