    def tokenize_string_column(self,
        Tokenizes string literals by assigning one split part (e.g., keyword) per row. The input dataframe must have
        at most two columns: one column containing strings to be tokenized, and (optionally) one column containing ids.


            Data_Frame (updated self)

            >>> import pandas

            >>> #===== TOKENIZING A SINGLE-COLUMN DATAFRAME ============================================================
            >>> # Make a single-column dataframe:
            >>> df = pandas.DataFrame({'the only column': ('a; b', 'c; d; e')})
            >>> my_Data_Frame = Data_Frame(df)
            >>> print(my_Data_Frame.dataframe)
              the only column
            0            a; b
            1         c; d; e

            >>> # Tokenize strings in single-column dataframe
            >>> my_Data_Frame.tokenize_string_column(string_column_name='the only column',
            ...                                      delimiter_pattern_in_literal_cells='; ')\
              the only column
            0               a
            1               b
            2               c
            3               d
            4               e

            >>> #===== TOKENIZING A TWO-COLUMN DATAFRAME ===============================================================
            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':['literal one; literal two', 'literal three; literal four'],
            ...      'id_column': ['id 1', 'id 2']
            ... })

            >>> # Tokenize and view the dataframe
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                    id_column_name='id_column',
            ...                                                    delimiter_pattern_in_literal_cells='; ')\
              id_column literal_column
            0      id 1    literal one
            1      id 1    literal two
            2      id 2  literal three
            3      id 2   literal four

            >>> #===== TOKENIZING WITH REMOVAL OF SPACES BEFORE/AFTER TOKENS ===========================================
            >>> # Unwanted spaces occur when a bare character (e.g., ';') is given as the delimiter instead of one that includes the space ('; ').

            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':['literal one ; literal two', 'literal three; literal four '],
            ...      'id_column': ['id 1', 'id 2']
            ... })

            >>> # Tokenize and view the dataframe
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                    id_column_name='id_column',
            ...                                                    delimiter_pattern_in_literal_cells=';')\
              id_column literal_column
            0      id 1    literal one
            1      id 1    literal two
            2      id 2  literal three
            3      id 2   literal four

            >>> #===== TOKENIZING IN CASES WHERE DELIMITERS ARE AT HEAD AND TAIL =======================================
            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':['tail issue a; tail issue b;', ';head issue a; head issue b', ';both issues a; both issues b;', 'no issues a; no issues b'],
            ...      'id_column': ['id 1', 'id 2', 'id 3', 'id 4']
            ... })

            >>> # Tokenize and view the dataframe
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                    id_column_name='id_column',
            ...                                                    delimiter_pattern_in_literal_cells='; ')\
              id_column literal_column
            0      id 1   tail issue a
            1      id 1   tail issue b
            2      id 2   head issue a
            3      id 2   head issue b
            4      id 3  both issues a
            5      id 3  both issues b
            6      id 4    no issues a
            7      id 4    no issues b

            >>> #===== TOKENIZING IN CASES WHERE DELIMITER(S) ARE THE ENTIRE STRING=====================================
            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':[';;;', ';;', ';', '; ;', 'non-problematic a; non-problematic b'],
            ...      'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id 5']
            ... })

            >>> # Tokenize and view the dataframe
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                    id_column_name='id_column',
            ...                                                    delimiter_pattern_in_literal_cells='; ')\
              id_column     literal_column
            0      id 5  non-problematic a
            1      id 5  non-problematic b

            >>> #===== A REAL WORLD TWO-COLUMN EXAMPLE =================================================================
            >>> # Create a dataframe
            >>> my_dataframe = pandas.DataFrame({'wosKeywords': ['Clinical Neurology; Orthopedics', 'Biology; Mathematical & Computational Biology', 'Physics, Nuclear', 'Plant Sciences'],
            ...                                  'articleId': ['wosres:WOS_000071013000007', 'wosres:WOS_000071018600001', 'wosres:WOS_000071021600006', 'wosres:WOS_000071040300005']})
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
                                articleId                                    wosKeywords
            0  wosres:WOS_000071013000007                Clinical Neurology; Orthopedics
            1  wosres:WOS_000071018600001  Biology; Mathematical & Computational Biology
            2  wosres:WOS_000071021600006                               Physics, Nuclear
            3  wosres:WOS_000071040300005                                 Plant Sciences

            >>> # Tokenize the string column
            >>> my_Data_Frame.tokenize_string_column(string_column_name='wosKeywords',
            ...                                                           id_column_name='articleId',
            ...                                                           delimiter_pattern_in_literal_cells='; ')\
                                articleId                           wosKeywords
            0  wosres:WOS_000071013000007                    Clinical Neurology
            1  wosres:WOS_000071013000007                           Orthopedics
            2  wosres:WOS_000071018600001                               Biology
            3  wosres:WOS_000071018600001  Mathematical & Computational Biology
            4  wosres:WOS_000071021600006                      Physics, Nuclear
            5  wosres:WOS_000071040300005                        Plant Sciences

            >>> #===== ERROR: DATAFRAME HAS TOO MANY COLUMNS ===========================================================
            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':['literal one; literal two', 'literal three; literal four'],
            ...     'id_column': ['id 1', 'id 2'],
            ...     'third_column': ['abc', 'xyz']
            ... })
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
              id_column               literal_column third_column
            0      id 1     literal one; literal two          abc
            1      id 2  literal three; literal four          xyz

            >>> # Error: The input dataframe has too many columns:
            >>> try: my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                         id_column_name='id_column',
            ...                                                         delimiter_pattern_in_literal_cells='; ')
            ... except IndexError as exception:  # catch exception
            ...     print (exception)
            'tokenize_string_column' method can only take a Pandas.DataFrame with two columns. The current number of columns is 3.

        import pandas
        from preprocessor.string_tools import String

        number_of_columns = self.dataframe.shape[1]
        if number_of_columns > 2:
            raise IndexError(
                "'tokenize_string_column' method can only take a Pandas.DataFrame with two columns. "
                "The current number of columns is %s." % number_of_columns)

        # get index positions of columns
        index_of_literal_column = self.dataframe.columns.get_loc(
        if id_column_name:
            index_of_id_column = self.dataframe.columns.get_loc(id_column_name)

        # tokenize literals at row level
        literal_column = self.dataframe[string_column_name]
        splitted_literal_column = literal_column.str.split(
        # update the column
        self.dataframe[string_column_name] = splitted_literal_column

        # create blank dataframe for output
        original_column_names = list(self.dataframe.columns)
        output_dataframe = pandas.DataFrame(columns=original_column_names)

        # create a new row for each tokenized literal
        for each_row_number, each_row in self.dataframe.iterrows():

            column_names = each_row.index.values
            row_values = each_row.values

            for each_literal in row_values[index_of_literal_column]:

                # Clean from unwanted spaces at head and tail of tokens
                each_literal = String(each_literal)
                    ' ')
                each_literal = str(each_literal)

                if len(
                ) > 0:  # do not allow empty rows to be part of the output dataframe

                    if id_column_name:
                        output_dataframe.loc[len(output_dataframe)] = (
                            row_values[index_of_id_column], each_literal)
                        output_dataframe.loc[len(output_dataframe)] = (

        self.dataframe = output_dataframe
        return self
    def purify_column(self, target_column_name):
        Cleans the specified column from undesirable characters.

            target_column_name(str): Column to be cleaned

            Data_Frame (updated self)

            >>> import pandas

            >>> # CLEAN A COLUMN =======================================================================================
            >>> # Create Data_Frame
            >>> my_dataframe = pandas.DataFrame({
            ...             'dirty_column':['{string} & one','String, "two"','[string] - three','(string) /\ four;'],
            ...             'id_column': ['id 1', 'id 2', 'id 3', 'id 4'],
            ...             'another_column': ['abc', 'mno', 'pqr', 'xyz']})
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
              another_column       dirty_column id_column
            0            abc     {string} & one      id 1
            1            mno      String, "two"      id 2
            2            pqr   [string] - three      id 3
            3            xyz  (string) /\ four;      id 4

            >>> # Clean the column
            >>> my_Data_Frame.purify_column('dirty_column')\
              another_column    dirty_column id_column
            0            abc  string and one      id 1
            1            mno     String, two      id 2
            2            pqr  string - three      id 3
            3            xyz     string four      id 4
            >>> #=======================================================================================================

            >>> # EXCEPTION: COLUMN MUST CONSIST OF STRINGS ============================================================
            >>> # Create a column that is made of integers
            >>> my_dataframe = pandas.DataFrame({
            ...      'integer_column':[1,
            ...                      2,
            ...                      3,
            ...                      4
            ...      ]
            ... })
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
            0               1
            1               2
            2               3
            3               4

            >>> # Fail to purify integer column
            >>> try:
            ...     my_Data_Frame.purify_column('integer_column')
            ... except Exception as exception:  # catch exception
            ...     print (exception)
            The target column "integer_column" must be of dtype "object". It is currently of dtype "int64".
            >>> #=======================================================================================================

        from preprocessor.string_tools import String

        target_column = self.dataframe[target_column_name]

        # Target column must be made of strings
                                dtype='object')  # 'O' stands for 'object'
        # a string column is categorized as 'object'

        conversion_dictionary = {
            '/': '',
            '',  # sometimes, a semicolon seems to be at the end of keywords (e.g.,instead of "kw1; kw2; kw3"
            # "kw1; kw2; kw3;")
            '&': 'and',
            '\(|\)': '',  # ()
            '\[|\]': '',  # []
            '\{|\}': '',  # {}
            '  ':
            ' '  # clean from double spaces (may occur after cleaning other characters)

        # Purify each string in the column
        for i, each_item in enumerate(target_column):
            each_String = String(each_item)
            target_column.loc[i] = each_String.content

        return self
    def clean_heads_and_tails_of_cells_in_column_from_patterns(
            self, target_column_name, patterns_to_remove, location):
        Cleans the specified strings in the column from specified characters at the heads, tails (or at both locations).

            target_column_name(str): Column to be cleaned
            patterns_to_remove(list): A list of strings containing patterns to remove.

        Keyword Args:
            head (patterns_to_remove): Cleans the beginning of the string from specified patterns
            tail (patterns_to_remove): Cleans the end of the string
            ends (patterns_to_remove): Cleans both the beginning and end of the string

            Data_Frame (updated self)

            >>> # INIT =================================================================================================
            >>> # Create Data_Frame
            >>> import pandas as pd
            >>> my_dataframe = pd.DataFrame({
            ...             'dirty_column':[';head issue','tail issue;',';both issues;',';complex situation; head',
            ...             'complex situation; tail;', ';complex situation; both;'],
            ...             'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id5', 'id6'],
            ...             'another_column': ['abc', 'def', 'mno', 'pqr', 'stu', 'xyz']})
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
              another_column               dirty_column id_column
            0            abc                ;head issue      id 1
            1            def                tail issue;      id 2
            2            mno              ;both issues;      id 3
            3            pqr   ;complex situation; head      id 4
            4            stu   complex situation; tail;       id5
            5            xyz  ;complex situation; both;       id6

            # CLEAN HEAD ===============================================================================================
            >>> # Clean the heads of strings (without touching the same pattern elsewhere)
            >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column', [';'], 'head')\
              another_column              dirty_column id_column
            0            abc                head issue      id 1
            1            def               tail issue;      id 2
            2            mno              both issues;      id 3
            3            pqr   complex situation; head      id 4
            4            stu  complex situation; tail;       id5
            5            xyz  complex situation; both;       id6

            # CLEAN TAIL ===============================================================================================
            >>> # Clean the tails of strings (without touching the same pattern elsewhere)
            >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column', [';'], 'tail')\
              another_column             dirty_column id_column
            0            abc               head issue      id 1
            1            def               tail issue      id 2
            2            mno              both issues      id 3
            3            pqr  complex situation; head      id 4
            4            stu  complex situation; tail       id5
            5            xyz  complex situation; both       id6

            # CLEAN BOTH ===============================================================================================
            >>> # Recreate Data_Frame
            >>> import pandas as pd
            >>> my_dataframe = pd.DataFrame({
            ...             'dirty_column':[';head issue','tail issue;',';both issues;',';complex situation; head',
            ...             'complex situation; tail;', ';complex situation; both;'],
            ...             'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id5', 'id6'],
            ...             'another_column': ['abc', 'def', 'mno', 'pqr', 'stu', 'xyz']})
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
              another_column               dirty_column id_column
            0            abc                ;head issue      id 1
            1            def                tail issue;      id 2
            2            mno              ;both issues;      id 3
            3            pqr   ;complex situation; head      id 4
            4            stu   complex situation; tail;       id5
            5            xyz  ;complex situation; both;       id6

            >>> # Clean both the heads and tails of strings (without touching the same pattern elsewhere)
            >>> # Note that when the location is 'both', removal proceeds ONLY if the pattern exists at both head and tail
            >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column',
            ...                                                                      patterns_to_remove=[';'],
            ...                                                                      location='both')\
              another_column              dirty_column id_column
            0            abc               ;head issue      id 1
            1            def               tail issue;      id 2
            2            mno               both issues      id 3
            3            pqr  ;complex situation; head      id 4
            4            stu  complex situation; tail;       id5
            5            xyz   complex situation; both       id6

            >>> # EXCEPTION: COLUMN MUST CONSIST OF STRINGS ============================================================
            >>> # Create a column that is made of integers
            >>> my_dataframe = pd.DataFrame({
            ...      'integer_column':[1,
            ...                      2,
            ...                      3,
            ...                      4
            ...      ]
            ... })
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
            0               1
            1               2
            2               3
            3               4

            >>> # Fail to clean integer column from characters
            >>> try:
            ...     my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('integer_column', [';'], 'head')
            ... except Exception as exception:  # catch exception
            ...     print (exception)
            The target column "integer_column" must be of dtype "object". It is currently of dtype "int64".
            >>> #=======================================================================================================
        from preprocessor.string_tools import String

        target_column = self.dataframe[target_column_name]

        # Target column must be made of strings
                                dtype='object')  # 'O' stands for 'object'
        # a string column is categorized as 'object'

        # Clean each string in the column from the specified characters
        for i, each_item in enumerate(target_column):
            each_String = String(each_item)

                patterns_to_remove=patterns_to_remove, location=location)
            target_column.loc[i] = each_String.content

        return self

my_row = CSV_Line(' "a" , "b" , "c" ,').clean_from_newline_characters().\
    clean_head_and_tail_from_patterns(' ', 'head').\
    clean_head_and_tail_from_patterns(' ,', 'tail').\
    parse_line_and_CONVERT_to_CSV_Row(' , ').\
    clean_cell_heads_and_tails_from_characters('"')  # usage of chain methods/fluid interface is optional


my_row.format_for_print_and_CONVERT_to_CSV_Line(column_separator=' , ',
                                                line_head=' ',
                                                line_tail=' ,',

String('ABC123').clip_at_index(4, remove='tail')

####################### CSV EXPORT #######################

demo_bibliography = Bibliography()
        'b_document', 'b_authors', 'b_topics', 'b_journal',
        'b_publication_month', 'b_issue_number', 'b_volume', 'b_pages',
Retrieves all articles from OpenCitations that have the same DOI as the records in the VU and UvA bibliographies.

# parse list from file (probably exists in ListData)
from retriever.sparql_tools import Open_Citations_Query
from meta.consoleOutput import ConsoleOutput
from preprocessor.string_tools import String

# NOTE(review): 'console' is not referenced again in this script; presumably ConsoleOutput is instantiated
# for its side effect of logging console output to 'log.txt' -- confirm before removing.
console = ConsoleOutput('log.txt')

# Collect one DOI per line of the input file
doi_list = []
with open('Input//all_dois_in_uva_and_vu_bibliographies.csv',
          encoding='utf8') as doi_file:
    for each_line in doi_file:
        each_line = String(each_line)

        # strip the trailing newline (and surrounding whitespace); otherwise the DOI is not usable as a
        # search key
        each_doi = each_line.content.strip()

        # blank lines carry no DOI
        if each_doi:
            doi_list.append(each_doi)

oc_query = Open_Citations_Query()
oc_query.retrieve_articles_by_dois(doi_list, show_progress_bar=True)

# A demo list with 100 DOIs
# doi_list = ['10.1163/187607508X384689', '10.1017/S0954579416000572', '10.1007/s11562-016-0353-7', '10.1016/j.adolescence.2016.09.008', '10.1186/s13561-016-0122-6', '10.1007/s00799-016-0182-6', '10.5194/gmd-2016-266', '10.1007/s00737-015-0531-2', '10.1103/RevModPhys.88.021003', 'https://doi.org/10.1101/167171', 'https://doi.org/10.1016/j.chb.2017.04.047', '10.1016/j.trb.2016.09.005', '10.1016/j.ancene.2016.01.001', '10.1111/adb.12322', '10.1017/njg.2016.45', '10.1080/1359432X.2016.1209489', '10.1117/1.JBO.21.6.066008', '10.5194/gmd-10-3329-2017', '10.1016/j.rser.2017.01.103', '10.1177/2050157916664559', '10.1007/978-3-319-45931-8_17', '10.1007/s11136-015-1171-8', '10.1145/2991079.2991121', '10.1093/cz/zow089', '10.1126/science.aac8167', '10.1007/s00586-016-4606-1', '10.1186/s12937-017-0229-6', '10.1007/s11357-016-9894-1', '10.1080/00130095.2015.1094371', '10.1016/j.epsl.2016.02.028', '10.1371/journal.pone.0168636', '10.1016/j.atmosres.2016.03.016', '10.1111/deci.12206', '10.1126/science.aad9634', '10.1103/PhysRevA.94.012506', '10.4103/0019-5545.196846', '10.1016/j.cedpsych.2017.01.006', '10.3324/haematol.2015.133470', '10.1057/978-1-137-50956-7', '10.1016/j.scico.2016.04.001', 'https://doi.org/10.1016/j.scico.2016.04.001', '10.1080/03081087.2015.1053425', '10.3758/s13423-017-1270-3', '10.1681/ASN.2015030287', '10.1016/j.avb.2016.05.006', '10.1177/0971333616689191', '10.1002/sej.1243', '10.1016/j.foreco.2017.06.023', '10.1103/PhysRevLett.118.071801', 'https://doi.org/10.1093/geront/gnv127', '10.1007/978-3-319-42324-1_16', '10.1109/JBHI.2015.2412656', '10.1016/j.jeem.2016.04.002', '10.1080/00207543.2015.1058982', '10.1038/mp.2016.100', '10.1080/03003930.2016.1194267', '10.1016/j.envint.2017.01.018', '10.1038/pr.2015.179', '10.1177/1753193416669263', '10.1016/j.tre.2016.11.003', '10.1021/acs.jpcc.5b12016', '10.1002/anie.201603510', '10.1073/pnas.1607005113', '(DOI) - 10.1111/cch.12521', '10.1017/S0016756815000886', '10.1080/1350293X.2015.1073507',
#             '10.1152/jn.00701.2015', '10.1371/journal.pone.0170791', '10.1016/j.seares.2016.07.005', '10.1016/j.reseneeco.2016.03.003', '10.1007/s00531-017-1499-0', '10.1007/s41669-017-0014-7', '10.1093/acrefore/9780190228613.013.439', '10.14814/phy2.13201', '10.1016/j.jtrangeo.2016.10.013', '10.1523/JNEUROSCI.3658-16.2017', '10.1192/bjpo.bp.115.000166', '10.1136/bmjgh-2016-000109', '10.7554/eLife.20320.001', '10.1037/pas0000332', '10.1177/1474704916673841', '10.1057/978-1-137-58179-2', '10.1002/ejp.963', '10.1017/thg.2016.78', '10.1038/tpj.2016.32', '10.1016/j.jesp.2017.03.008', '10.1287/trsc.2015.0647', '10.1186/s13015-016-0087-3', '10.1016/j.neuroimage.2016.10.030', '10.1371/journal.pone.0169109', '10.1007/s11367-017-1358-z', '10.1080/1369183X.2015.1061425', '10.2196/mental.4614', '10.1002/arp.1564', '10.1021/acs.orglett.6b01023', '10.3847/1538-4357/aa6c47', 'http://www.socialevraagstukken.nl/veiligheid-creeer-je-met-geborgenheid/', '10.1186/s12888-016-0790-0', '10.1371/journal.pone.0155755', '10.1103/PhysRevLett.116.241801']
    def get_line_at_position_from_file(self, line_number):
        """
        Returns a specified line from the TextFile without reading the whole file into memory.

        Args:
            line_number(int): Position of the requested line. Line numbers start from 1 (i.e., the first line
                of the file is line 1, not line 0).

        Returns:
            String class object (created from string at line in file). The trailing newline character of the
            line is not included.

        Raises:
            ValueError: If line_number is not a positive integer.
            IndexError: If the file has fewer lines than line_number.

        Examples:
            >>> # return first line of file
            >>> my_file = Text_File('test_data//example_merged_yasgui_1000.csv')
            >>> my_file.get_line_at_position_from_file(1)
            '"publication_type" , "journal_article" , "title" , "publication_year" , "author_name" , "journal_name" , "journal_issue_number" , "journal_volume_number" , "startEndPages" , "publisher_name" , "doi" , "cited_by_article" ,'

            >>> # return another line
            >>> my_file.get_line_at_position_from_file(122)
            '"Journal Article" , "https://w3id.org/oc/corpus/br/3448" , "Perioperative Myocardial Infarction" , "2009" , "Beattie - W. S. | Mosseri - M. | Jaffe - A. S. | Alpert - J. S." , "Circulation" , "22" , "119" , "2936--2944" , "Ovid Technologies (Wolters Kluwer Health)" , "10.1161/circulationaha.108.828228" , "https://w3id.org/oc/corpus/br/3426" ,'

            >>> # return last line
            >>> my_file.get_line_at_position_from_file(267)
            '"Journal Article" , "https://w3id.org/oc/corpus/br/3437" , "Myocardial Injury after Noncardiac Surgery" , "2014" , "Niebrzegowska - Edyta | Benton - Sally | Wragg - Andrew | Archbold - Andrew | Smith - Amanda | McAlees - Eleanor | Ramballi - Cheryl | MacDonald - Neil | Januszewska - Marta | Shariffuddin - Ina I. | Vasanthan - V. | Hashim - N. H. M. | Undok - A. Wahab | Ki - Ushananthini | Lai - Hou Yee | Ahmad - Wan Azman | Ackland - Gareth | Khan - Ahsun | Almeida - Smitha | Cherian - Joseph | Furruqh - Sultana | Abraham - Valsa | Paniagua - Pilar | Urrutia - Gerard | Maestre - Mari Luz | Santaló - Miquel | Gonzalez - Raúl | Font - Adrià | Martínez - Cecilia" , "Anesthesiology" , "3" , "120" , "564--578" , "Ovid Technologies (Wolters Kluwer Health)" , "10.1097/aln.0000000000000113" , "https://w3id.org/oc/corpus/br/3522 | https://w3id.org/oc/corpus/br/300243 | https://w3id.org/oc/corpus/br/3062326 | https://w3id.org/oc/corpus/br/3271454 | https://w3id.org/oc/corpus/br/3879533 | https://w3id.org/oc/corpus/br/4205354 | https://w3id.org/oc/corpus/br/5253819 | https://w3id.org/oc/corpus/br/6332120 | https://w3id.org/oc/corpus/br/7799424 | https://w3id.org/oc/corpus/br/8003885 | https://w3id.org/oc/corpus/br/8185544" ,'

            >>> # erroneous index number entered (0)
            >>> # return first line of file
            >>> my_file = Text_File('test_data//example_merged_yasgui_1000.csv')
            >>> try:
            ...     my_file.get_line_at_position_from_file(0) #  line_number cannot be 0
            ... except Exception as error_message:
            ...     print('Exception: ' + str(error_message))
            Exception: Parameter value must be a positive integer but is "0" of <class 'int'>.

            >>> # erroneous index number entered (too high)
            >>> try:
            ...     my_file.get_line_at_position_from_file(300) # there is no 300th line in the file
            ... except IndexError as error_message:
            ...     print('Exception: ' + str(error_message))
            Exception: Requested line number '300' does not exist in file.
        """
        # Reject invalid line numbers before the file is touched. bool is excluded explicitly because it is a
        # subclass of int (True would otherwise be accepted as line 1).
        if isinstance(line_number, bool) or not isinstance(line_number, int) or line_number < 1:
            raise ValueError(
                'Parameter value must be a positive integer but is "%s" of %s.' %
                (line_number, type(line_number)))

        from preprocessor.string_tools import String

        with open(self.input_file_path, encoding='utf8') as input_file:
            # start=1 aligns the iteration counter with line numbers (which start from 1); iterating over the
            # file object reads it line by line and stops at the requested line, so the rest of the file is
            # never loaded
            for current_line_number, each_line in enumerate(input_file, start=1):
                if current_line_number == line_number:
                    # if not cleaned from '\n', comparisons and operations tend to be problematic
                    # write to file with base print() function to get back the new line in the end
                    return String(each_line.rstrip('\n'))

        # the whole file was traversed without reaching the requested line
        raise IndexError(
            "Requested line number '%s' does not exist in file." %
            line_number)
    def is_each_row_balanced(self, exclude_special_rows_of_syntax=None):
        """
        Checks whether each row in buffer is balanced (i.e., does not have unmatched parentheses, brackets, etc). Can
        exclude special row types (e.g., comment) from evaluation.

        Args:
            exclude_special_rows_of_syntax(str): specifies what type of rows to exclude from evaluation
                (e.g., comment rows). Uses predefined syntax settings per specified syntax (e.g., 'bibtex').
                Any value other than 'bibtex' (including the default None) applies no exclusions.

        Keyword Args:
            - bibtex (exclude_special_rows_of_syntax): sets evaluation exclusion criteria for bibtex syntax

        Returns:
            boolean: True if every row is either balanced or forgiven by the exclusion rules (an empty buffer
                also yields True); False as soon as one row is unbalanced and not forgiven.

        Examples:
            >>> # an unbalanced row is present
            >>> my_buffer = ListBuffer()
            >>> my_buffer.append_row(['a', 'b', 'c']).append_row(['d', 'e', 'f']).dataset
            [['a', 'b', 'c'], ['d', 'e', 'f']]
            >>> my_buffer.append_row(['g', 'h' , '>']).is_each_row_balanced()
            False

            >>> # single row from a bib file
            >>> my_buffer = ListBuffer()
            >>> my_buffer.append_row('            year      = "2017",').is_each_row_balanced()
            True

            >>> # bibtex entry start (no exception vs. exception)
            >>> my_buffer.append_row('@article{96d9add3e2f44e8abbf030170689bc30,').is_each_row_balanced()
            False
            >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
            True

            >>> # bibtex comment (no exception vs. exception)
            >>> my_buffer = ListBuffer()
            >>> my_buffer.append_row('% This is a comment with an unbalanced characters }]>').is_each_row_balanced()
            False
            >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
            True

            >>> # a full bibtex entry with an unbalanced curly bracket at title field
            >>> my_buffer = ListBuffer()
            >>> my_buffer.dataset = ['@book{a82caf00e1a143759c7f5543b6c84ea5,', 'title     = "{Knowledge Representation for Health Care (AIME 2015 International Joint Workshop, KR4HC/ProHealth 2015)",', 'author    = "D Riano and R. Lenz and S Miksch and M Peleg and M. Reichert and {ten Teije}, A.C.M.",', 'year      = "2015",', 'doi       = "10.1007/978-3-319-26585-8",', 'isbn      = "9783319265841",', 'series    = "LNAI",', 'publisher = "Springer",', 'number    = "9485",', '}', '']
            >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')  # error
            False
            >>> # the same entry with unbalanced curly bracket removed
            >>> my_buffer.dataset = ['@book{a82caf00e1a143759c7f5543b6c84ea5,', 'title     = "Knowledge Representation for Health Care (AIME 2015 International Joint Workshop, KR4HC/ProHealth 2015)",', 'author    = "D Riano and R. Lenz and S Miksch and M Peleg and M. Reichert and {ten Teije}, A.C.M.",', 'year      = "2015",', 'doi       = "10.1007/978-3-319-26585-8",', 'isbn      = "9783319265841",', 'series    = "LNAI",', 'publisher = "Springer",', 'number    = "9485",', '}', '']
            >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
            True
        """
        from preprocessor.string_tools import String

        # bibtex row types that are forgiven for being unbalanced: an entry opens its curly bracket on the
        # first row and closes it on the last one, and comments are free text
        forgiven_bibtex_line_types = ('start of entry', 'end of entry', 'comment')

        for each_row in self.dataset:
            # rows can be lists or strings; evaluate their string representation
            each_row = String(str(each_row))

            if each_row.is_balanced():
                continue

            ### EXCLUSIONS FOR BIBTEX ###########################################
            if exclude_special_rows_of_syntax == 'bibtex' and any(
                    each_row.is_line_type('bibtex', each_line_type)
                    for each_line_type in forgiven_bibtex_line_types):
                # the row is unbalanced, but it passed the exclusion rules
                continue

            # the row is unbalanced and no exclusion applies; a single such row decides the outcome
            return False

        return True
    def cleanAndTokenizeCsv(instance):
        # Reads the file at instance.csv_file_path as raw text, cleans the text according to
        # instance.cleaning_algorithm ('parse only' skips cleaning, 'default' runs the substitutions below),
        # and returns the parsed content.
        # NOTE(review): the descriptive text and example that follow read like this method's docstring, but
        # no docstring delimiters or section headings are visible in this excerpt. Several statements further
        # down also appear truncated -- confirm against the full file.
        Imports the .csv file as raw text, cleans it (if cleaning algorithm is specified), and then tokenizes it.

            List containing parsed data from the .csv file. For each row in the .csv file (including headers row), a
                sub-list is created in the main list.

            >>> from preprocessor.Text_File import Text_File
            >>> my_file = Text_File('example_data//problematic_yasgui_csv_file.csv')
            >>> my_file.print_lines(2)
            "27624462" , "2016" , "Journal Article" , "Duku - Stephen Kwasi☆☆ Opokué | Asenso-Boadi - Francis" , "[]{}\ '<Utilization, of, healthcare services and renewal of health insurance membership: evidence of adverse selection in Ghana" , "Springer Science + Business Media" , "1" , "Health Econ Rev - Health Economics Review" , "http://dx.doi.org/10.1186/s13561-016-0122-6" , "6" , "10.1186/s13561-016-0122-6" , "" , "https://w3id.org/oc/corpus/br/3555801" , "https://w3id.org/oc/corpus/br/18754 | https://w3id.org/oc/corpus/br/18792" ,

            >>> my_csv_bibliography = CSV_Bibliography(
            ...                           csv_file_path='example_data//problematic_yasgui_csv_file.csv',
            ...                           id_column_header='journal_article',
            ...                           field_value_list_separator=' | ',
            ...                           csv_delimiter_character=',',
            ...                           cleaning_algorithm='default'
            ... )
            Conversion from ListData to Bibliography object started
            Conversion completed. 2 out of 2 ListData rows converted to Bibliography object entries
            >>> my_csv_bibliography.preview(1) # notice the character conversions in the 'authors' and 'title' fields
            ----------------------------------ENTRY 1----------------------------------
             {'': '',
              'authors': ['Duku - Stephen Kwasiaa OpokuAS', 'Asenso-Boadi - Francis'],
              'cited_by_the_articles': '',
              'cited_the_articles': ['https://w3id.org/oc/corpus/br/18754',
              'doi': '10.1186/s13561-016-0122-6',
              'journal_article': 'https://w3id.org/oc/corpus/br/3555801',
              'journal_issue_number': '1',
              'journal_name': 'Health Econ Rev - Health Economics Review',
              'journal_volume_number': '6',
              'pmid': '27624462',
              'publication_type': 'Journal Article',
              'publication_year': '2016',
              'publisher_name': 'Springer Science + Business Media',
              'title': ' Utilization-of-healthcare services and renewal of health '
                       'insurance membership: evidence of adverse selection in Ghana',
              'url': 'http://dx.doi.org/10.1186/s13561-016-0122-6'})
        import re
        import csv
        from os import remove as os_remove
        from preprocessor.string_tools import String

        # open the csv file and read it to a variable
        # NOTE(review): the remaining arguments of this open() call (and its closing parenthesis) are not
        # visible in this excerpt.
        imported_file_raw = open(instance.csv_file_path,
        imported_string_raw = imported_file_raw.read()

        # if no cleaning algorithm is specified, skip cleaning and just tokenize
        if instance.cleaning_algorithm == 'parse only':
            imported_string_cleaned = imported_string_raw

        # otherwise, run cleaning algorithm
        elif instance.cleaning_algorithm == 'default':
            # TODO: The current way to remove in-string commas is tuned for OpenCitations data with yasgui style CSV. Make a generic version by using a while loop (see commented out draft below).
            # clean commas that occur in entry field values (i.e., within strings)
            # The three substitutions work together: ' ,' is first swapped for the placeholder '_-_-_', then
            # every remaining ', ' is replaced with '-', and finally the placeholder is turned back into ' ,'.
            # NOTE(review): the string argument of each re.sub() call is not visible in this excerpt.
            imported_string_cleaned = re.sub(' ,', '_-_-_',
            imported_string_cleaned = re.sub(', ', '-',
            imported_string_cleaned = re.sub('_-_-_', ' ,',

            # clean CSV file from double quotes
            # (the pattern matches a double quote together with the space before or after it)
            imported_string_cleaned = re.sub(' "|" ', '',

            # clean from characters and patterns that are generally problematic for parsing operations
            # NOTE(review): only the wrapping in String and the conversion back to str are visible here; the
            # cleaning call that presumably sits between them is not shown -- confirm against the full file.
            imported_string_cleaned = String(imported_string_cleaned)
            imported_string_cleaned = str(imported_string_cleaned)

            # # Draft while loop for a more generic future algorithm to replace in-string commas:
            # between_quotes = False
            # for i, each_character in enumerate(imported_string_cleaned):
            #    if between_quotes:
            #        if each_character == ",":
            #            imported_string_cleaned[i] = "-"
            #            print(imported_string_cleaned)
            #    # first occurrence
            #    if each_character == '\"' and not between_quotes:
            #        between_quotes = True
            #    elif each_character == '\"' and between_quotes:
            # importCleanedCsvbetween_quotes = False
        # if the cleaning_algorithm parameter is not recognized, return error
        # NOTE(review): no 'else:' is visible before this raise, although the comment above and the
        # indentation indicate that it belongs to the final branch of the if/elif chain.
            raise ValueError('Unknown algorithm type: ' +
                             instance.cleaning_algorithm +
                             '. Please enter a valid algorithm string.')

        # close the original csv file (no changes made to it)
        # NOTE(review): the statement that closes imported_file_raw is not visible in this excerpt.

        # create a temporary file to hold the cleaned csv file (a file is needed for csv() function)
        # NOTE(review): the temporary file is created in the current working directory under a fixed name;
        # the statements that write the cleaned string to it and close it are not visible in this excerpt.
        cleaned_file_path = "temp_cleaned.csv"
        cleaned_csv_file = open(cleaned_file_path, mode="w", encoding="utf8")

        # read from the temporary file and tokenize it
        # NOTE(review): the argument of list(...) is not visible; presumably a csv reader over
        # cleaned_csv_file -- confirm against the full file.
        cleaned_csv_file = open(cleaned_file_path, mode="r", encoding="utf8")
        cleaned_csv_file_content = list(

        # remove the temporary file
        # NOTE(review): the statements that close and delete the temporary file (os_remove is imported
        # above) are not visible in this excerpt.

        return cleaned_csv_file_content