iBeyond_corpus_builder.py (forked from haroonrasheed333/NLPCareerTrajectory)
import os
import time
import random
import shutil
import progressbar
from collections import Counter, defaultdict
from lxml import etree
from util import stripxml
from job_title_normalization import normalize_job_titles

user_name = os.environ.get('USER')

# Load the top job titles that will serve as classification labels.
with open('extracted_data/top_titles.txt', 'r') as f:
    top_jobs = [t.strip().lower() for t in f.readlines()]

# Tracks how many resumes have been saved per job title.
job_count = defaultdict(int)
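# The loader above assumes extracted_data/top_titles.txt holds one job title
# per line; the entries below are hypothetical, shown only to illustrate the
# expected format:
#
#     software engineer
#     project manager
#     business analyst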
def split_data(labels_list, paths):
    """
    Split the dataset into training and heldout datasets.

    Parameters:
    -----------
    labels_list -- list
        List of (filename, tag) tuples for each resume
    paths -- dict
        Dict of source and destination directories for the data
    """
    # Path where the plaintext resumes are stored
    source_dir = os.path.join(paths['main_source_directory'], paths['plaintext_data_directory'])
    # Store the training and heldout data in separate directories.
    training_dir = os.path.join(paths['main_source_directory'], paths['training_directory'])
    heldout_dir = os.path.join(paths['main_source_directory'], paths['heldout_directory'])

    random.seed(int(time.time()))
    random.shuffle(labels_list)
    num_files = len(labels_list)

    # 80/20 split between training and heldout files
    training_files = labels_list[:int(num_files * 0.8)]
    heldout_files = labels_list[int(num_files * 0.8):]

    with open(os.path.join(paths['main_source_directory'], paths['labels_file_path']), 'w') as labels, \
            open(os.path.join(paths['main_source_directory'], paths['labels_heldout_file_path']), 'w') as labels_heldout:
        for (filename, tag) in training_files:
            shutil.copy2(os.path.join(source_dir, filename), training_dir)
            labels.write(filename + "\t" + tag + "\n")
        for (filename, tag) in heldout_files:
            shutil.copy2(os.path.join(source_dir, filename), heldout_dir)
            labels_heldout.write(filename + "\t" + tag + "\n")
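# A minimal sketch of how split_data could be exercised on its own; the paths
# and label tuple below are hypothetical, not part of the original pipeline:
#
#     paths = {'main_source_directory': '/tmp/data',
#              'plaintext_data_directory': 'text',
#              'training_directory': 'training',
#              'heldout_directory': 'heldout',
#              'labels_file_path': 'labels.txt',
#              'labels_heldout_file_path': 'labels_heldout.txt'}
#     split_data([('resume_00042_plaintext.txt', 'software engineer')], paths)
#
# Each labels file then holds one "<filename>\t<title>" pair per line.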
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Clean the resume data and extract job titles from the resume.

    Parameters:
    -----------
    fname -- string
        Name of the resume file
    paths -- dict
        Dict containing paths of source directories
    names -- list
        Candidate names extracted so far. Used to remove duplicate resumes.
    job_titles -- list
        Titles extracted from resumes
    labels_list -- list
        (filename, title) tuples that will be used as labels for the classifier

    Returns:
    --------
    The updated names, job_titles and labels_list.
    """
    source_dir = os.path.join(paths['main_source_directory'], paths['xml_data_directory'])
    try:
        xml = etree.parse(os.path.join(source_dir, fname))

        # Extract the current job title and current job element from the xml.
        current_job_title = xml.xpath('//job[@end = "present"]/title/text()')
        current_job_title = normalize_job_titles(current_job_title)
        current_job = xml.xpath('//job[@end = "present"]')

        # Extract the contact information from the xml.
        contact = xml.xpath('//contact')

        # Since there are many duplicate resumes in the data, filter them out
        # by candidate name and only keep resumes with unique names.
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
        if name not in names:
            names.append(name)
        else:
            return names, job_titles, labels_list

        # Remove the candidate contact information from the resume.
        if contact:
            contact[0].getparent().remove(contact[0])

        # Remove the current job section(s) from the resume, since the current
        # job title is the label our algorithm will try to predict.
        for job in current_job:
            job.getparent().remove(job)

        # Convert the xml to a string and strip the xml tags from the resume.
        xml = etree.tostring(xml, pretty_print=True)
        text_data = stripxml(xml)

        in_top_jobs = False
        # Remove all words matching the current job title from the resume text,
        # so no information about the current job leaks into the features.
        if current_job_title:
            title = current_job_title[0].strip()
            text_data = text_data.replace(title, '')
            if title in top_jobs:
                in_top_jobs = True
                job_count[title] += 1

        # Only save resumes whose current job title is among the top titles,
        # capping each title at 300 resumes to limit class imbalance.
        if in_top_jobs and job_count[title] < 300:
            job_titles.append(title)
            directory = os.path.join(paths['main_source_directory'], paths['plaintext_data_directory'])
            with open(os.path.join(directory, fname[:-4] + '_plaintext.txt'), 'w') as out:
                out.write(text_data)
            labels_list.append((fname[:-4] + '_plaintext.txt', title.replace('\n', '')))
        return names, job_titles, labels_list
    except Exception:
        # Skip resumes that fail to parse or are missing expected fields.
        return names, job_titles, labels_list
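# The xpath queries above assume a resume xml roughly shaped like the sketch
# below. Only the element and attribute names come from the queries; the
# surrounding structure and values are hypothetical:
#
#     <resume>
#       <contact>
#         <givenname>Jane</givenname>
#         <surname>Doe</surname>
#       </contact>
#       <experience>
#         <job end="present">
#           <title>Software Engineer</title>
#         </job>
#         <job end="2012">
#           <title>QA Analyst</title>
#         </job>
#       </experience>
#     </resume>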
def prepare_data(paths):
    """
    Prepare the data and split it into training and heldout datasets.

    Parameters:
    -----------
    paths -- dict
        Dict containing paths of source directories
    """
    source_dir = os.path.join(paths['main_source_directory'], paths['xml_data_directory'])

    # Get the xml resume files from the source directory.
    files = [f for (dirpath, dirnames, filenames) in os.walk(source_dir)
             for f in filenames if f.endswith('.txt')]
    random.seed(int(time.time()))
    random.shuffle(files)

    names = []
    job_titles = []
    labels_list = []
    j, bar = 0, pbar(len(files))
    bar.start()

    # Extract the information from each xml file and store it in plaintext files.
    for f in files:
        (names, job_titles, labels_list) = clean_data_and_extract_job_titles(f, paths, names, job_titles, labels_list)
        j += 1
        bar.update(j)
    bar.finish()

    # Split the saved resumes (those belonging to the top job titles) into
    # training and heldout datasets.
    print(Counter(job_titles))
    split_data(labels_list, paths)
def pbar(size):
    """
    Create a progress bar to display the progress of a long-running operation.
    """
    bar = progressbar.ProgressBar(maxval=size,
                                  widgets=[progressbar.Bar('=', '[', ']'),
                                           ' ', progressbar.Percentage(),
                                           ' ', progressbar.ETA(),
                                           ' ', progressbar.Counter(),
                                           '/%s' % size])
    return bar
if __name__ == "__main__":
    paths = dict()
    paths['main_source_directory'] = '/Users/' + user_name + '/Documents/Data'
    paths['xml_data_directory'] = 'samples_0426'
    paths['plaintext_data_directory'] = 'samples_0426_text'
    paths['training_directory'] = 'training_0426'
    paths['heldout_directory'] = 'heldout_0426'
    paths['labels_file_path'] = 'labels_0426.txt'
    paths['labels_heldout_file_path'] = 'labels_heldout_0426.txt'
    prepare_data(paths)
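# Assuming the directory layout above exists under /Users/<user>/Documents/Data
# (with the xml resumes in samples_0426 and the output directories already
# created), the script can be run directly; hypothetical invocation:
#
#     $ python iBeyond_corpus_builder.py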