def get_abbreviation_dict(sentences):
    """Extract abbreviation/definition pairs from a text or a list of sentences.

    Args:
        sentences: Either a single string, or a list/tuple of strings. A
            sequence is joined with single spaces into one document before
            extraction.

    Returns:
        Whatever ``schwartz_hearst.extract_abbreviation_definition_pairs``
        returns when called with ``most_common_definition=True``.

    Raises:
        TypeError: If ``sentences`` is neither a string nor a list/tuple.
            (Previously this case surfaced as an ``UnboundLocalError``.)
    """
    if isinstance(sentences, str):
        doc_text = sentences
    elif isinstance(sentences, (list, tuple)):
        doc_text = " ".join(sentences)
    else:
        raise TypeError(
            "sentences must be a str or a list of str, got "
            + type(sentences).__name__)
    return schwartz_hearst.extract_abbreviation_definition_pairs(
        doc_text=doc_text, most_common_definition=True)
示例#2
0
def get_abbreviations(text):
    """
    Run abbreviation extraction over ``text`` and return the result of
    ``schwartz_hearst.extract_abbreviation_definition_pairs`` unchanged.
    """
    return schwartz_hearst.extract_abbreviation_definition_pairs(
        doc_text=text)
示例#3
0
def extractAcronymsAndPhrases(text):
  """Map each lemmatized definition phrase in ``text`` to its lower-cased acronym.

  Every definition phrase is printed as it is processed. Keys are produced by
  ``toLemmas(str(phrase))``; values are ``acronym.lower()``.
  """
  links = {}
  extracted = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=text)
  for short_form, long_form in extracted.items():
    print(long_form)
    lowered = short_form.lower()
    links[toLemmas(str(long_form))] = lowered
  return links
示例#4
0
def run_schwartz_algorithm():
    """Extract acronyms from every entry in ``./original_text`` and record them.

    For each directory entry, each (acronym, full form) pair is passed to
    ``insert_acronym`` and one summary dict
    ``{'document_id': ..., 'acronyms': [...]}`` is appended to the
    module-level ``db_acronyms`` list.
    """
    global db_acronyms
    for doc_name in os.listdir("./original_text"):
        extracted = schwartz_hearst.extract_abbreviation_definition_pairs(
            file_path="./original_text/" + doc_name)
        entries = []
        for short_form, long_form in extracted.items():
            entries.append({'acronym': short_form, 'full_form': long_form})
            insert_acronym(doc_name, short_form, long_form)
        db_acronyms.append({'document_id': doc_name, 'acronyms': entries})
def find_abbreviations(text_docs: List[str]) -> Dict[str, str]:
    '''
    Collect abbreviations across several documents.

    Each document is processed separately with most_common_definition=True
    and the results are merged into one dictionary (abbr. : full_form).
    When the same abbreviation occurs in more than one document, the entry
    from the later document replaces the earlier one.
    '''
    merged: Dict[str, str] = {}
    for text in text_docs:
        merged.update(
            schwartz_hearst.extract_abbreviation_definition_pairs(
                doc_text=text, most_common_definition=True))
    return merged
示例#6
0
def get_abbreviations():
    """Sort the abbreviation pairs of every dataset file into ``abb`` or ``ignore``.

    Reads each entry of the module-level ``all_cases`` (skipping names that
    start with a dot) from ``<ENV["DATASET_PATH"]>/All_FT/`` and extracts
    abbreviation/definition pairs from its content.

    A pair goes into ``abb`` when either
      * the helper ``function`` reports a zero second value for the
        abbreviation, or
      * summed over the space-separated words of the definition, the second
        value is zero or first/second exceeds 0.8.
    Otherwise it goes into ``ignore``. Both mappings hold, per abbreviation,
    a list of distinct definitions.

    NOTE(review): ``function`` appears to return (upper-case count,
    lower-case count) for a string — confirm against its definition.
    """
    for case in all_cases:
        if case[0] == ".":
            continue
        with open("{}/All_FT/{}".format(ENV["DATASET_PATH"], case),
                  'r') as file:
            pairs = schwartz_hearst.extract_abbreviation_definition_pairs(
                doc_text=file.read())

            for short_form, long_form in pairs.items():
                _, short_lower = function(short_form)
                short_has_no_lower = short_lower == 0

                # Accumulate both counts over the words of the definition.
                upper_total = 0
                lower_total = 0
                for token in long_form.split(' '):
                    token_upper, token_lower = function(token)
                    upper_total += token_upper
                    lower_total += token_lower

                # The zero check comes first so the division is never
                # attempted with a zero denominator.
                long_mostly_upper = (lower_total == 0
                                     or upper_total / lower_total > 0.8)

                if short_has_no_lower or long_mostly_upper:
                    bucket = abb
                else:
                    bucket = ignore

                known = bucket.setdefault(short_form, [])
                if long_form not in known:
                    known.append(long_form)
示例#7
0
def main(file, mode):
    """Extract abbreviation pairs from a tab-separated, pre-processed file.

    The first line yielded by ``preprocess_file(file)`` is treated as a
    header and skipped. Every following line is split on tab characters and
    the resulting rows are handed to
    ``schwartz_hearst.extract_abbreviation_definition_pairs`` as
    ``tagged_text``.

    Args:
        file: Passed straight through to ``preprocess_file``.
        mode: Unused; kept so existing callers keep working.

    Returns:
        The result of ``extract_abbreviation_definition_pairs``.
    """
    lines = iter(preprocess_file(file))
    next(lines, None)  # skip the header line, if there is one

    # str.split with an explicit separator always yields at least one field,
    # so the old "len(i) < 1" branch (which dropped into pdb) could never run.
    rows = [line.split('\t') for line in lines]
    return schwartz_hearst.extract_abbreviation_definition_pairs(tagged_text=rows)
def extract_expansions(acronyms, use_cached=True):
    """Build (or reuse) the JSON file of acronym expansions found in ``./abstracts``.

    Args:
        acronyms: Not read by this function; the mapping is rebuilt from the
            abstracts on disk.
        use_cached: Currently overridden to ``True`` inside the function
            (see the TODO below), so an existing output file is always reused.

    Returns:
        Path of the JSON output file.
    """
    print('Extracting expansions from Pubmed...')
    cache_path = './data/derived/pubmed_acronym_expansions.json'
    # TODO use_cached = True because script is not runnable right now
    use_cached = True
    if use_cached and os.path.exists(cache_path):
        return cache_path

    expansions_by_acronym = collections.defaultdict(list)
    for abstract_name in os.listdir('./abstracts'):
        if not abstract_name.endswith('.txt'):
            continue
        found = schwartz_hearst.extract_abbreviation_definition_pairs(
            file_path='./abstracts/' + abstract_name)
        for short_form, long_form in found.items():
            expansions_by_acronym[short_form].append(long_form)

    with open(cache_path, 'w') as handle:
        json.dump(expansions_by_acronym, handle)
    return cache_path
示例#9
0
def step_impl(context):
    """Run abbreviation extraction on ``context.text`` and store it in ``context.result``."""
    extracted = schwartz_hearst.extract_abbreviation_definition_pairs(
        doc_text=context.text)
    context.result = extracted
示例#10
0
# Collect abbreviation/definition pairs from every PDF below root_dir and
# write them to a two-column CSV file.
root_dir = '/home/nightingale/Documents/jeff_dev/Hobby_Projects/'

accro_dict = {}
# Only PDFs are scanned. The previous pattern `('**/*.pdf' or '**/*.doc')`
# always evaluated to '**/*.pdf', and the follow-up check
# `'.doc' or '.pdf' in filename` was always true, so .doc files were never
# actually visited; the reader used below is a PDF reader in any case.
for filename in glob.iglob(root_dir + '**/*.pdf', recursive=True):
    # The context manager closes each file once its pages have been read.
    with open(filename, 'rb') as pdf_obj:
        pdf_reader = PyPDF2.PdfFileReader(pdf_obj)

        for page_number in range(pdf_reader.numPages):
            page_obj = pdf_reader.getPage(page_number)
            text = page_obj.extractText()
            pairs = schwartz_hearst.extract_abbreviation_definition_pairs(
                doc_text=text, most_common_definition=True)
            # Later pages/files overwrite earlier definitions of the same term.
            accro_dict.update(pairs)
            print(accro_dict)

# Materialize the dict views as lists so both columns are plain sequences.
df = pd.DataFrame({
    'ACRONYM': list(accro_dict.keys()),
    'MEANING': list(accro_dict.values()),
})
df.to_csv('example1.csv', index=False)

# By default, the most recently encountered definition for each term is returned
#pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text= text)

# pairs = schwartz_hearst.extract_abbreviation_definition_pairs(file_path='<path_to_file>')

# # If multiple definitions are encountered for each term, you might want to return the most common for each
# pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text='...', most_common_definition=True)