# Read the ';'-separated dictionary file (latin1 encoded) into memory.
with codecs.open(args.dict_file, 'rb', 'latin1') as f:
    lines = f.readlines()

# Maps each lower-cased dictionary term to its de-duplicated spelling
# variants; terms for which the service returns nothing are left out.
spelling_vars = {}
for count, line in enumerate(lines, 1):
    entry = line.split(';')
    # lexicon service needs lower case input
    term = entry[0].lower().replace('"', '')

    # Keep asking until the lookup succeeds.
    while True:
        # Pause before every request.
        sleep(1)
        try:
            words = get_spelling_variants(term, [], 1600, 1830)
            words = list(set(words))
            break
        except Exception:
            # A bare ``except:`` would also swallow KeyboardInterrupt and
            # SystemExit, making this endless retry loop hard to abort.
            print('Retry!')
            sleep(5)

    if words:
        spelling_vars[term] = words

    # Progress marker every 1000 input lines.
    if count % 1000 == 0:
        print(count)
    # NOTE(review): the nesting of this print was lost in the collapsed
    # source; it is assumed to run for every term -- confirm.
    # The single-argument form prints the same text under Python 2 and 3.
    print('%s %s' % (term, words))
# Legend lines of the LIWC dictionary, stripped, in input order.
liwc_category_output = []
# Maps each lower-cased LIWC term to the term itself plus its variants.
spelling_vars = {}
# Maps every word (term or variant) to its list of LIWC categories.
liwc_output = {}
for line in lines:
    # legend
    # Blank / whitespace-only lines are handled here as well: in the word
    # branch they would produce an empty ``entry`` and raise IndexError
    # (previously only blank lines starting with '\r' were caught).
    if (not line.strip() or line[0].isdigit()
            or line.startswith(('%', '\r'))):
        liwc_category_output.append(line.strip())
        continue

    # word
    entry = line.split()
    # lexicon service needs lower case input
    term = entry[0].lower()
    categories = entry[1:]

    # Pause before every request.
    sleep(0.3)
    # NOTE(review): 1600 and 1830 are presumably a year range -- confirm
    # against get_spelling_variants.
    words = get_spelling_variants(term, categories, 1600, 1830)
    words.append(term)
    words = list(set(words))
    spelling_vars[term] = words
    # The single-argument form prints the same text under Python 2 and 3.
    print('%s %s' % (term, words))

    for word in words:
        known = liwc_output.get(word)
        if known and categories != known:
            # The word was already seen with different categories: store
            # the sorted union of both category lists.
            liwc_output[word] = sorted(set(categories + known))
        else:
            liwc_output[word] = categories
#with codecs.open('liwc_output.json', 'w', 'utf8') as f: