import pandas as pd

import parsers as pr


def to_txt(dir='resumes/'):
    ''' convert the CVs to plain text and save a mapping of their id and path '''
    i = 0  # numeric id
    files = pr.explore(dir)  # get list of all supported files

    # lists of cv details
    cv = []
    cv_txt = []
    cv_id = []

    for f in files:
        if pr.parse(f, i) == 1:
            # add cv details
            cv_id.append(i)
            cv.append(f)
            cv_txt.append('corpus/op/' + str(i) + '.txt')
            i += 1

    d = {'cid': cv_id, 'cv': cv, 'txt': cv_txt}

    # make dataframe of cv-id-path mapping
    df = pd.DataFrame(d)
    df.set_index('cid', inplace=True)  # set_index returns a copy unless inplace=True
    print(df)
    df.to_csv('db.csv')
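# A minimal sketch of the project-local `parsers` module that to_txt() and
# the other snippets here rely on. Nothing below is the real implementation;
# the supported extensions and return conventions are assumptions read off
# the call sites (explore() returns file paths, parse() writes
# corpus/op/<cid>.txt and returns 1 on success).
import os


def explore(dir_path):
    """List files under dir_path with extensions the pipeline can handle."""
    supported = ('.pdf', '.docx', '.txt')  # assumed format list
    found = []
    for root, _dirs, names in os.walk(dir_path):
        for name in names:
            if name.lower().endswith(supported):
                found.append(os.path.join(root, name))
    return found


def parse(path, cid):
    """Extract plain text from path into corpus/op/<cid>.txt; 1 on success."""
    try:
        # txt-only fallback; the real module presumably handles PDF/DOCX too
        with open(path, 'r', errors='ignore') as src:
            text = src.read()
        with open('corpus/op/{}.txt'.format(cid), 'w') as dst:
            dst.write(text)
        return 1
    except OSError:
        return 0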
import pickle

from gensim.models import Word2Vec

import parsers as pr


def train_model(model_name, corpus_dir='corpus/tokenized/'):
    files = pr.explore(corpus_dir)
    tkns = []
    for file in files:
        with open(file, 'rb') as f:
            tkns.append(pickle.load(f))  # each pickle holds one CV's token list
    # skip-gram model, 300-dimensional vectors
    # (the parameter is `vector_size` in gensim >= 4.0; it was `size` before)
    model = Word2Vec(tkns, vector_size=300, window=5, sg=1)
    model.save(model_name)
    return True
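# Quick sanity check after training. The filename is a placeholder, and the
# probe term only works if it survived stemming into the model's vocabulary.
from gensim.models import Word2Vec

model = Word2Vec.load('cv_w2v.model')
print(model.wv.most_similar('python', topn=5))  # nearest terms in the embedding space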
import pickle as pkl

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import parsers as pr


def prepare_corpus(corpus_dir='corpus/'):
    txt_dir = 'op/'
    tkn_dir = 'tokenized/'
    files = pr.explore(corpus_dir + txt_dir)  # list of all plain-text CVs in corpus/op/
    print("Files: {}".format(len(files)))

    stemmer = PorterStemmer()                     # build once instead of per word
    stop_words = set(stopwords.words('english'))  # set lookup instead of per-word list scan

    for file in files:
        with open(file, 'r') as f:
            content = f.read()
        # tokenize, remove stopwords (case-insensitively) and stem
        content = nltk.word_tokenize(content)
        content = [
            stemmer.stem(word) for word in content
            if word.lower() not in stop_words and word.isalnum()
        ]
        with open(corpus_dir + tkn_dir + file.split('/')[-1].split('.')[0] + '.pkl',
                  'wb') as pfile:
            pkl.dump(content, pfile)  # dump tokenized text
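# What prepare_corpus() does to a single string, on a toy example (assumes
# NLTK's 'punkt' and 'stopwords' data have already been downloaded):
sample = "Built REST APIs with Django and Flask"
tokens = nltk.word_tokenize(sample)
stop_words = set(stopwords.words('english'))
stems = [PorterStemmer().stem(w) for w in tokens
         if w.lower() not in stop_words and w.isalnum()]
print(stems)  # expected: ['built', 'rest', 'api', 'django', 'flask']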
def upload():
    form = UploadJD()
    results = {}
    if form.validate_on_submit():
        results = run(form.jd.data, pr.explore('corpus/tokenized/'))
    return render_template("jd.html", form=form, results=results)
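# The view above assumes an UploadJD form defined elsewhere; a minimal
# flask_wtf sketch consistent with its usage (only the `jd` field name is
# taken from the code, everything else is an assumption):
from flask_wtf import FlaskForm
from wtforms import SubmitField, TextAreaField
from wtforms.validators import DataRequired


class UploadJD(FlaskForm):
    jd = TextAreaField('Job Description', validators=[DataRequired()])
    submit = SubmitField('Rank CVs')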
    # score every CV vector against the job-description vector
    for vec, cid in vcv:
        res.append((1 - spatial.distance.cosine(vjd, vec), cid))
    res.sort(reverse=True)  # highest similarity first
    print(res[:15])

    with open('oput.pkl', 'wb') as op:
        pkl.dump(res, op)

    df = pd.read_csv("db.csv")
    response = {'cid': [], 'score': [], 'path': []}
    for i in range(min(20, len(res))):  # guard: fewer than 20 CVs must not crash
        response['cid'].append(res[i][1])
        response['score'].append("{:.1f}".format(res[i][0] * 100))
        response['path'].append(df[df['cid'] == int(res[i][1])]['cv'].values[0])
    return response


if __name__ == '__main__':
    with open('jd.txt', 'r') as jd:
        gg = jd.read()
    result = run(gg, pr.explore('corpus/tokenized'))
    print(pd.DataFrame(result))
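# The ranking loop above consumes vjd (the job-description vector) and vcv
# (a list of (vector, cv_id) pairs) built earlier in run(). A plausible
# construction, assuming each document is embedded as the average of its
# tokens' Word2Vec vectors (a common approach, not necessarily this
# project's exact method):
import numpy as np


def doc_vector(tokens, model):
    """Mean of the Word2Vec vectors of the in-vocabulary tokens."""
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)  # OOV-only documents map to zero
    return np.mean(vecs, axis=0)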
# create the directory structure if it is missing
os.makedirs('resumes/doc2/', exist_ok=True)
os.makedirs('corpus/op/', exist_ok=True)
os.makedirs('corpus/tokenized/', exist_ok=True)
print("[Directory Structure OK]")

# check if CVs are available in resumes folder
if pr.explore('resumes/') == []:
    print("No CVs available, please put CVs in 'resumes' folder.")
    exit()

# (re)create corpus; skip when one already exists and the user declines
recreate = True
files = pr.explore('corpus/op/')
if files != []:
    ch = input('Recreate corpus? (y/n) ')
    recreate = ch.lower() in ['y', 'yes']
    if recreate:
        for tmp in pr.explore('resumes/doc2'):
            os.remove(tmp)
        for tmp in files:
            os.remove(tmp)

if recreate:
    to_txt('resumes/')
    print('[Conversion to plain text done]')
    prepare_corpus('corpus/')
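# Training would presumably follow the corpus preparation above; a minimal
# call using train_model() from this section (the filename is a placeholder):
train_model('cv_w2v.model')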
""" PROBABLY WON'T RUN ON WINDOWS """ import os import glob import subprocess import parsers as pr files = pr.explore(os.getcwd()) for f in files: if (f.endswith(".doc")): subprocess.call([ 'soffice', '--headless', '--convert-to', 'docx', '--outdir', os.path.join(os.getcwd(), 'doc2'), f ])