Example #1
import pandas as pd

import parsers as pr  # project module that finds and parses CV files


def to_txt(cv_dir='resumes/'):
    """Convert the CVs to plain text and save a mapping of their id and path."""
    i = 0  # numeric id
    files = pr.explore(cv_dir)  # get list of all supported files

    # lists of cv details
    cv = []
    cv_txt = []
    cv_id = []

    for f in files:
        if pr.parse(f, i) == 1:  # parse succeeded
            # add cv details
            cv_id.append(i)
            cv.append(f)
            cv_txt.append('corpus/op/' + str(i) + '.txt')
            i += 1

    d = {
        'cid': cv_id,
        'cv': cv,
        'txt': cv_txt
    }  # cv-id-path mapping
    df = pd.DataFrame(d)
    df = df.set_index('cid')  # set_index returns a new frame; keep the result
    print(df)
    df.to_csv('db.csv')
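For reference, the id-to-path mapping written by to_txt() can be read back like this (a minimal sketch; 'db.csv' and the column names match the code above):

import pandas as pd

df = pd.read_csv('db.csv', index_col='cid')
print(df.loc[0, 'txt'])  # plain-text path for CV id 0, assuming at least one CV parsed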
Example #2
import pickle

from gensim.models import Word2Vec

import parsers as pr


def train_model(model_name, corpus_dir='corpus/tokenized/'):
    files = pr.explore(corpus_dir)
    tkns = []
    for file in files:
        with open(file, 'rb') as f:
            tkns.append(pickle.load(f))  # each pickle holds one tokenized CV

    # 'size' was renamed 'vector_size' in gensim 4.0 (use size=300 on gensim 3.x)
    model = Word2Vec(tkns, vector_size=300, window=5, sg=1)
    model.save(model_name)
    return True
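A quick way to sanity-check the trained embeddings (a sketch using gensim's standard API; 'cv.model' stands in for whatever model_name was passed to train_model(), and the vocabulary consists of Porter-stemmed tokens from the CVs):

from gensim.models import Word2Vec

model = Word2Vec.load('cv.model')  # hypothetical model file name
print(model.wv.most_similar('python', topn=5))  # nearest neighbours, if 'python' is in the vocabulary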
Example #3
import os
import pickle as pkl

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import parsers as pr


def prepare_corpus(corpus_dir='corpus/'):
    txt_dir = 'op/'
    tkn_dir = 'tokenized/'

    files = pr.explore(corpus_dir + txt_dir)  # all plain-text CVs in corpus/op/
    print("Files: {}".format(len(files)))

    stemmer = PorterStemmer()               # build once, not per word
    stop = set(stopwords.words('english'))  # set membership is O(1)

    for file in files:
        with open(file, 'r') as f:
            content = f.read()
        # tokenize, remove stopwords and stem
        tokens = nltk.word_tokenize(content)
        tokens = [
            stemmer.stem(word) for word in tokens
            if word not in stop and word.isalnum()
        ]
        name = os.path.splitext(os.path.basename(file))[0]
        with open(corpus_dir + tkn_dir + name + '.pkl', 'wb') as pfile:
            pkl.dump(tokens, pfile)  # dump tokenized text
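The run() fragment in Example #4 below compares a job description against each CV by cosine distance between document vectors, but the vectorization step itself is elided there. One common approach, and a plausible reconstruction given the Word2Vec model above, is to average the vectors of a document's tokens (a sketch; doc_vector is not a function from the original code):

import numpy as np

def doc_vector(tokens, model):
    # mean of the Word2Vec vectors of the tokens the model knows
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)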
Example #4
import pandas as pd
import pickle as pkl
from flask import render_template
from scipy import spatial

import parsers as pr


# Flask view; the original snippet omits the app object and the @app.route decorator
def upload():
    form = UploadJD()
    results = {}
    if form.validate_on_submit():
        results = run(form.jd.data, pr.explore('corpus/tokenized/'))
    return render_template("jd.html", form=form, results=results)
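The UploadJD form itself does not appear in these snippets; a minimal FlaskForm matching how it is used here (form.jd.data, form.validate_on_submit()) might look like this (an assumption, not the original definition):

from flask_wtf import FlaskForm
from wtforms import SubmitField, TextAreaField
from wtforms.validators import DataRequired

class UploadJD(FlaskForm):
    jd = TextAreaField('Job Description', validators=[DataRequired()])  # pasted JD text
    submit = SubmitField('Rank CVs')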
def run(jd, files):
    # Reconstructed signature (matches the call sites in upload() and __main__).
    # The original snippet omits the step that builds vjd (the job-description
    # vector) and vcv (a list of (cv_vector, cv_id) pairs); one plausible
    # version is the doc_vector() sketch after Example #3.
    res = []
    for i in range(len(vcv)):
        res.append((1 - spatial.distance.cosine(vjd, vcv[i][0]), vcv[i][1]))

    res.sort(reverse=True)  # highest similarity first
    print(res[:15])

    with open('oput.pkl', 'wb') as op:
        pkl.dump(res, op)

    df = pd.read_csv("db.csv")
    response = {'cid': [], 'score': [], 'path': []}
    for i in range(min(20, len(res))):  # guard against fewer than 20 CVs
        response['cid'].append(res[i][1])
        response['score'].append("{:.1f}".format(res[i][0] * 100))  # similarity as a percentage
        response['path'].append(df[df['cid'] == int(res[i][1])]['cv'].values[0])

    return response

if __name__ == '__main__':
    with open('jd.txt', 'r') as f:
        jd_text = f.read()

    result = run(jd_text, pr.explore('corpus/tokenized'))
    print(pd.DataFrame(result))

Example #5
import os
import sys

import parsers as pr

# create_txts() and prepare_corpus() are the project helpers shown above

# make sure the expected directory layout exists
os.makedirs('resumes/doc2/', exist_ok=True)
os.makedirs('corpus/op/', exist_ok=True)
os.makedirs('corpus/tokenized/', exist_ok=True)

print("[Directory Structure OK]")

# check if CVs are available in resumes folder
if not pr.explore('resumes/'):
    print("No CVs available, please put CVs in 'resumes' folder.")
    sys.exit()

# (re)create corpus
files = pr.explore('corpus/op/')
if files:
    ch = input('Recreate corpus? (y/n) ')
    if ch.lower() in ['y', 'yes']:
        # drop converted .docx copies and the old plain-text corpus
        for tmp in pr.explore('resumes/doc2'):
            os.remove(tmp)
        for tmp in files:
            os.remove(tmp)
        create_txts('resumes')
        print('[Conversion to plain text done]')
        prepare_corpus('corpus/')
"""
PROBABLY WON'T RUN ON WINDOWS 
"""
import os
import glob
import subprocess
import parsers as pr

files = pr.explore(os.getcwd())
for f in files:
    if f.endswith(".doc"):  # legacy .doc only; .docx files are skipped
        subprocess.call([
            'soffice', '--headless', '--convert-to', 'docx', '--outdir',
            os.path.join(os.getcwd(), 'doc2'), f
        ])
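Since the script shells out to LibreOffice, a small guard like the following (a sketch, not part of the original script) makes that dependency explicit before the batch conversion runs:

import shutil
import sys

if shutil.which('soffice') is None:
    sys.exit("LibreOffice ('soffice') not found on PATH; install it before converting .doc files.")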